<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v26i1e60231</article-id>
      <article-id pub-id-type="pmid">39689306</article-id>
      <article-id pub-id-type="doi">10.2196/60231</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Sample Size Requirements for Popular Classification Algorithms in Tabular Clinical Data: Empirical Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Tsafnat</surname>
            <given-names>Guy</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Figueroa</surname>
            <given-names>Rosa</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Gad</surname>
            <given-names>Ahmed G</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Hasan</surname>
            <given-names>Md Rakibul</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Silvey</surname>
            <given-names>Scott</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Biostatistics</institution>
            <institution>School of Public Health</institution>
            <institution>Virginia Commonwealth University</institution>
            <addr-line>830 East Main Street</addr-line>
            <addr-line>Richmond, VA, 23219</addr-line>
            <country>United States</country>
            <phone>1 4348256974</phone>
            <email>silveys@vcu.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4540-4239</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>Jinze</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0555-9412</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Biostatistics</institution>
        <institution>School of Public Health</institution>
        <institution>Virginia Commonwealth University</institution>
        <addr-line>Richmond, VA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Scott Silvey <email>silveys@vcu.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>17</day>
        <month>12</month>
        <year>2024</year>
      </pub-date>
      <volume>26</volume>
      <elocation-id>e60231</elocation-id>
      <history>
        <date date-type="received">
          <day>6</day>
          <month>5</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>27</day>
          <month>7</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>20</day>
          <month>9</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>20</day>
          <month>10</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Scott Silvey, Jinze Liu. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 17.12.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2024/1/e60231" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>The performance of a classification algorithm eventually reaches a point of diminishing returns, where adding more samples does not further improve the results. Thus, there is a need to determine an optimal sample size that maximizes performance while accounting for computational burden or budgetary concerns.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to determine optimal sample sizes and the relationships between sample size and dataset-level characteristics over a variety of binary classification algorithms.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>A total of 16 large open-source datasets were collected, each containing a binary clinical outcome. Furthermore, 4 machine learning algorithms were assessed: XGBoost (XGB), random forest (RF), logistic regression (LR), and neural networks (NNs). For each dataset, the cross-validated area under the curve (AUC) was calculated at increasing sample sizes, and learning curves were fit. Sample sizes needed to reach the observed full–dataset AUC minus 2 points (0.02) were calculated from the fitted learning curves and compared across the datasets and algorithms. Dataset–level characteristics, minority class proportion, full–dataset AUC, number of features, type of features, and degree of nonlinearity were examined. Negative binomial regression models were used to quantify relationships between these characteristics and expected sample sizes within each algorithm. A total of 4 multivariable models were constructed, which selected the best-fitting combination of dataset–level characteristics.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Among the 16 datasets (full-dataset sample sizes ranging from 70,000-1,000,000), median sample sizes were 9960 (XGB), 3404 (RF), 696 (LR), and 12,298 (NN) to reach AUC stability. For all 4 algorithms, more balanced classes (multiplier: 0.93-0.96 for a 1% increase in minority class proportion) were associated with decreased sample size. Other characteristics varied in importance across algorithms—in general, more features, weaker features, and more complex relationships between the predictors and the response increased expected sample sizes. In multivariable analysis, the top selected predictors were minority class proportion among all 4 algorithms assessed, full–dataset AUC (XGB, RF, and NN), and dataset nonlinearity (XGB, RF, and NN). For LR, the top predictors were minority class proportion, percentage of strong linear features, and number of features. Final multivariable sample size models had high goodness-of-fit, with dataset–level predictors explaining a majority (66.5%-84.5%) of the total deviance in the data among all 4 models.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>The sample sizes needed to reach AUC stability among 4 popular classification algorithms vary by dataset and method and are associated with dataset–level characteristics that can be influenced or estimated before the start of a research study.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>medical informatics</kwd>
        <kwd>machine learning</kwd>
        <kwd>sample size</kwd>
        <kwd>research design</kwd>
        <kwd>decision trees</kwd>
        <kwd>classification algorithm</kwd>
        <kwd>clinical research</kwd>
        <kwd>learning-curve analysis</kwd>
        <kwd>analysis</kwd>
        <kwd>analyses</kwd>
        <kwd>guidelines</kwd>
        <kwd>ML</kwd>
        <kwd>decision making</kwd>
        <kwd>algorithm</kwd>
        <kwd>curve analysis</kwd>
        <kwd>dataset</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Machine learning (ML) is becoming increasingly popular within the domain of health care data analysis and clinical decision-making [<xref ref-type="bibr" rid="ref1">1</xref>]. The lack of a fixed model specification and distributional assumptions allows for these methods to learn complex relationships that are not necessarily linear in nature, such as high-order interactions and polynomial effects. Due to this, most popular machine-learning algorithms require much larger sample sizes than traditional statistical methods [<xref ref-type="bibr" rid="ref2">2</xref>]. However, exact amounts are not clear, and there are many different ML algorithms, each containing its own limitations and properties [<xref ref-type="bibr" rid="ref3">3</xref>]. Furthermore, in traditional statistical analysis, we can often analytically derive equations that measure how the sample size needed to detect a certain prespecified effect will behave under certain assumptions [<xref ref-type="bibr" rid="ref4">4</xref>]. Due to the data-driven and algorithmic nature of ML methods, which rely on computational approaches rather than statistical theory to capture relationships, an empirical approach is necessary in order to understand the behavior of these methods under varying conditions.</p>
        <p>It is known that, for any given dataset, there is a point where adding additional samples will not increase the performance metrics of the model considerably [<xref ref-type="bibr" rid="ref5">5</xref>]. Thus, it becomes important to collect enough data to optimize these metrics while also accounting for this performance ceiling and the budgetary or computational concerns that may arise when collecting substantial amounts of unnecessary data.</p>
        <p>Another reason for the difficulty in selecting a proper sample size when applying ML is the lack of a true end point or common metric of interest. As discussed previously, the traditional target for sample size determination methods is the statistical power to detect a certain effect size [<xref ref-type="bibr" rid="ref4">4</xref>]. In ML, since predictive performance rather than parameter estimation is usually of interest, this end point becomes unclear. A commonly used metric of predictive performance is prediction accuracy, defined as the proportion of correct classifications made [<xref ref-type="bibr" rid="ref6">6</xref>]. However, the prediction accuracy is related to the distribution of the outcome; for a rare event, accuracy can be high even with a completely noninformative model [<xref ref-type="bibr" rid="ref7">7</xref>]. As a result, a fairer performance metric is the area under the receiver operating characteristic curve (area under the curve [AUC]), which evaluates model predictions over a range of probability thresholds from 0 to 1 [<xref ref-type="bibr" rid="ref8">8</xref>]. AUC is widely used to evaluate the performance of an ML algorithm and has several desirable properties. First is interpretability—a higher AUC indicates a higher degree of separability, and an AUC of 0.5 implies a completely random prediction, while an AUC of 1.0 indicates perfect classification. A second desirable property of AUC is insensitivity to the proportion of cases versus controls in the dataset [<xref ref-type="bibr" rid="ref9">9</xref>]; because the entire range of probability thresholds is considered, AUC can also be considered the “average” sensitivity (true-positive rate) over all possible values of specificity (true-negative rate). While the AUC is commonly used to evaluate the initial performance of an ML algorithm, other metrics, such as calibration [<xref ref-type="bibr" rid="ref10">10</xref>], may also be preferred once modeling reaches later stages. 
However, it is necessary to ensure that a trained ML model can first make stable predictions before assessing further metrics such as calibration or threshold-selection.</p>
      </sec>
      <sec>
        <title>Related Works</title>
        <p>The concept of empirically estimating the performance of a classification algorithm as the training set size increases has been widely explored in a variety of different settings. This is typically done by creating a “learning curve,” measuring a metric (such as classification accuracy) as a function of sample size [<xref ref-type="bibr" rid="ref11">11</xref>]. Perlich et al [<xref ref-type="bibr" rid="ref12">12</xref>] compared logistic regression (LR) approaches versus decision-tree–based approaches, demonstrating that LR often outperforms tree induction in small samples, but decision trees excel as the sample size becomes large. Mukherjee et al [<xref ref-type="bibr" rid="ref13">13</xref>] developed a method to assess the error rate of a classifier as a function of sample size using an inverse power-law model. Their method was introduced in the context of DNA microarray data, which often contains a large amount of features and limited access to samples due to cost restraints. Figueroa et al [<xref ref-type="bibr" rid="ref14">14</xref>] modified the original learning curve fitting process by using nonlinear weighted least squares to favor future predictions, using 3 moderately sized datasets to demonstrate their algorithm. Provost et al [<xref ref-type="bibr" rid="ref15">15</xref>] used learning curves and efficient progressive sampling to show that classification algorithms eventually converge to a stable accuracy with increasing sample size, mainly focusing on the methodology of the sampling scheme. More recently, van der Ploeg et al [<xref ref-type="bibr" rid="ref16">16</xref>] used several clinical datasets and a simulation-based approach to show that modern classification algorithms such as neural networks (NNs) and random forest (RF) require at least 200 events per variable to reach a stable AUC. 
Richter and Khoshgoftaar [<xref ref-type="bibr" rid="ref17">17</xref>] experimented with learning curves on biomedical big data with limited labels and heavy class imbalance, using 1% of the full dataset AUC as their stopping rule. Because the cost of labeling certain types of data is expensive, it is important to maximize the quality of the data while minimizing costs. They found that a semisupervised approach and pseudolabeled data generated from a small amount of actual data could accurately predict future performance.</p>
      </sec>
      <sec>
        <title>Study Aim</title>
        <p>Previous contributions have focused on the methodology of learning curve fitting or estimating future performance from an already-collected sample. In those that have examined similar end points (ie, AUC plateau or stability over a variety of algorithms), the number of real-world datasets included was small, modern gradient boosting techniques (XGBoost [XGB], etc) were not examined because they had not yet been developed, and the impact of dataset–level characteristics on sample sizes was not extensively studied [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. Previous literature has also mostly used small datasets in the context of -omics type data. In general, a focused clinical study often contains fewer features, a wide variety of variable types (ie, numeric, categorical, ordinal, etc), and fewer correlated features than -omics data [<xref ref-type="bibr" rid="ref18">18</xref>]. This study aims to develop algorithm-specific sample size guidelines using dataset–level variables that can be estimated or manipulated by researchers before any data has been collected, analogous to a sample size calculation performed in a traditional power analysis. The focus of these guidelines was on the stable internal validity of each method, which is typically the first benchmark used to assess performance when attempting to develop a predictive model. We examined 4 popular binary classification algorithms in the context of clinical research, where the aim is to predict a health-related outcome such as a disease state or event. The contributions of this study include a learning curve analysis of 16 real-world datasets, an examination of modern gradient-boosting methods (XGB) within this analysis, and concrete sample size guidelines based on dataset-level characteristics.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Dataset Description</title>
        <p>We have collected 16 public-access clinical datasets ranging from sample sizes of 70,000-1,000,000. A detailed description of dataset sources, variables included, and outcomes can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. It should be noted that 8 of these 16 datasets were artificially created from smaller real-life datasets using Bayesian Network Generation, and their details have been previously discussed [<xref ref-type="bibr" rid="ref19">19</xref>]. All datasets contained a single binary outcome, such as a disease state, with a combination of continuous numeric, discrete numeric, or binary predictors. Numeric features with at least 10 unique values were considered continuous.</p>
        <p>A detailed description of specific data preprocessing steps can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. In summary, nominal variables were converted to binary variables based on arbitrary binning rules, and variables containing text values (ie, “gender”: male vs female) were also converted to binary variables. Missing data was present in 3 datasets only (CDC Heart Disease [2022], Diabetes130, and COVID-19), although the amount of missingness was quite low among these sets (1.3%, &#60;1%, and &#60;1%, respectively). Without knowing additional information regarding the nature of these missing values, we considered them missing completely at random (MCAR) and performed mean imputation [<xref ref-type="bibr" rid="ref20">20</xref>].</p>
      </sec>
      <sec>
        <title>Classification Algorithms</title>
        <p>We examined the following binary classifiers on each dataset: Logistic Regression (LR) [<xref ref-type="bibr" rid="ref21">21</xref>], Random Forest (RF) [<xref ref-type="bibr" rid="ref22">22</xref>], XGBoost (XGB) [<xref ref-type="bibr" rid="ref23">23</xref>], and Neural Networks (NNs) [<xref ref-type="bibr" rid="ref24">24</xref>]. These algorithms were selected due to their widespread and popular use in clinical data analysis [<xref ref-type="bibr" rid="ref25">25</xref>]. We performed LR by fitting a multivariable model using all predictors without any variable selection or regularization methods. For the RF and XGB algorithms, hyperparameters were left at their default values, which can be found in the R documentation [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. For NNs, we used the R (R Foundation for Statistical Computing) package <italic>h2o</italic> [<xref ref-type="bibr" rid="ref26">26</xref>] to perform our analyses; we considered one hidden layer with 20 units and 10 epochs of data training. The type of NN used by “h2o” is a multilayer feedforward artificial NN, also known as multilayer perceptron (MLP). The activation function used in the hidden layers was the default linear rectifier, with softmax activation in the final output nodes for probability estimation and classification. Other NN hyperparameters were again left at their default values, which can be found in the documentation [<xref ref-type="bibr" rid="ref26">26</xref>].</p>
      </sec>
      <sec>
        <title>Learning Curve Approach</title>
        <p>From the 16 datasets studied, we evaluated the cross-validated area under the curve (CV-AUC) as a function of increasing sample size. Cross-validation is a method of assessing the internal validity of a classifier; it works by splitting the entire training dataset into <italic>k</italic> folds and fitting <italic>k</italic> models, with a single fold left out for evaluation in each model [<xref ref-type="bibr" rid="ref27">27</xref>]. The final cross-validated metric (in our case, AUC) is calculated by taking the average performance over all <italic>k</italic> of the held-out folds. As a result, the entire training dataset is used to generate an estimate of out-of-sample performance. The learning curve approach is detailed below:</p>
        <list list-type="order">
          <list-item>
            <p>Create a list of proposed training set sizes.</p>
          </list-item>
          <list-item>
            <p>At each point in the sample size interval, randomly sample 10 subdatasets of size n from the full dataset.</p>
          </list-item>
          <list-item>
            <p>In each of the 10 subdatasets, estimate the (5-fold, outcome-stratified) CV-AUC on the proposed algorithm of choice. Average the 10 CV-AUC values to generate an estimate of out-of-sample performance at a given n.</p>
          </list-item>
          <list-item>
            <p>Repeat at the next n in the list.</p>
          </list-item>
        </list>
        <p>For the first step, the training set size list usually consisted of 10 evenly spaced points ranging from n=500 to n=50,000, but if stability was not reached by n=50,000, the end point was extended. For LR, the final n was lower, as the AUC from these models typically became stable much earlier than more complex ML algorithms. A full description of the sample size intervals used for each dataset and each algorithm can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> (Table S1). Stability was defined as the smallest n where the CV-AUC was within 2 points (0.02) of the observed full–dataset AUC. For example, if the full–dataset AUC was 0.85, we would obtain the smallest n where a CV-AUC of 0.83 was first surpassed. The full–dataset AUC for each classification algorithm was calculated using 5-fold stratified cross-validation (CV) on the entire dataset. We chose this stopping point of 0.02 because, although arbitrary, we believed that it provided the most reasonable trade-off between high performance and computational burden. Specifically, as can be seen visually from figures in the Learning Curve Results section, this stopping point typically marks the beginning of the “point of diminishing returns,” where the power law curves begin to plateau, and the amount of additional sample needed to make further improvements increases exponentially. It is important to note that once the learning curve equations are estimated, this choice of stopping point can be freely altered. Therefore, although we report 0.02 in this study as our stopping rule of interest, the final equations derived below can be re-estimated with any user-specified stopping point (for example, 0.01 or 0.05). We present sample size results using alternative thresholds of 0.01 and 0.05 AUC points from the full–dataset AUC in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <p>Once the raw data was generated, estimated learning curves were fit using nonlinear least squares optimization [<xref ref-type="bibr" rid="ref28">28</xref>], following the power law equation: AUC<italic><sub>(</sub></italic><sub>n</sub><italic><sub>)</sub></italic> = an<sup>b</sup>+c, where a and b were estimated, and c was either fixed to be the full–dataset AUC or was also estimated, depending on the quality of the fit. For some datasets and algorithms, the power law function did not fit the data well. These were typically scenarios where the dataset required a relatively larger sample size to become stable. In these scenarios, we instead fit the learning curves using a logarithmic function, AUC<sub>(n)</sub>=β<sub>0</sub>+β<sub>1</sub>*log(n), where β<sub>0</sub> and β<sub>1</sub> were estimated using ordinary least-squares [<xref ref-type="bibr" rid="ref29">29</xref>].</p>
      </sec>
      <sec>
        <title>Sample Size Determination and Guidelines</title>
        <p>Following the learning curve analysis of the 4 selected algorithms on our datasets, we examined the effects of 6 dataset–level factors on the sample sizes needed for AUC stability. These included minority class proportion (maximum value of 50%, indicating no class imbalance), separability (defined as the full–dataset AUC itself), the total number of features, the percentage of features that were continuous (versus binary or discrete numeric), the percentage of “core linear” features, and “dataset nonlinearity.” Core linear features were determined by adding an L1 (LASSO [least absolute shrinkage and selection operator]) penalty to the LR model for each full dataset [<xref ref-type="bibr" rid="ref30">30</xref>]. The percentage of variables that did not shrink to zero when this penalty was added were defined as core linear features. Dataset nonlinearity was a rough measure of the degree of nonlinear or interactive relationships between the predictors and the outcomes that were present in the data. This was defined as the point difference in the full–dataset AUC when using a complex algorithm (XGB) compared with LR. For example, if LR yielded a full–dataset AUC of 0.90 and XGB yielded a full–dataset AUC of 0.95, the dataset nonlinearity would be calculated as 5.0. For the purpose of calculating these values, XGB hyperparameters were left at their default values [<xref ref-type="bibr" rid="ref23">23</xref>].</p>
        <p>Within the context of each algorithm, the relationship between these dataset–level variables and the n required for AUC stability was examined. Since the estimated sample sizes were discrete and right-skewed numeric values, we used negative binomial regression models [<xref ref-type="bibr" rid="ref31">31</xref>] to quantify the strength and significance of each dataset characteristic on predicted sample sizes, which produce coefficients in terms of log-expected counts. Then, in multivariable negative binomial regression models for each algorithm, we selected up to 3 dataset–level predictors that together minimized the Akaike Information Criterion, which evaluates how well the model fits the data while penalizing for the number of parameters estimated [<xref ref-type="bibr" rid="ref32">32</xref>]. A maximum of 3 predictors per model were considered in order to avoid potential overfitting.</p>
        <p>We also calculated adjusted deviance-based pseudo-<italic>R</italic><sup>2</sup> statistics [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>], which further quantified each model’s goodness-of-fit and proportion of deviance explained by the predictors. The final model equations were reported and discussed for each algorithm, and visualizations of the model predictions at varying levels of each dataset-level characteristic were generated. Statistical significance was set to α=.05 for all hypothesis tests considered, and R (version 4.2.3; R Foundation for Statistical Computing) was used for all analyses.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>A workflow and overview of the study’s aims and end points can be seen in <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
      <p>We gathered 16 datasets with sample sizes ranging from 70,000 to 1,000,000 (<xref ref-type="table" rid="table1">Table 1</xref>). Out of the 4 classification algorithms examined, XGB performed the best or tied for the best performance on 14/16 (87.5%) datasets, while RF performed the best on 2/16 (12.5%). Full dataset AUCs (separability) ranged from 0.608-0.979 (XGB), 0.609-0.976 (RF), 0.596-0.949 (LR), and 0.603-0.974 (NN; <xref ref-type="table" rid="table2">Table 2</xref>). As expected, LR models generally performed the worst, with full–dataset AUCs that were 0.028 points lower on average compared with XGB.</p>
      <fig id="figure1" position="float">
        <label>Figure 1</label>
        <caption>
          <p>Flowchart of study aims and deliverables. AUC: area under the curve.</p>
        </caption>
        <graphic xlink:href="jmir_v26i1e60231_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <table-wrap position="float" id="table1">
        <label>Table 1</label>
        <caption>
          <p>Dataset–level characteristics.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="190"/>
          <col width="90"/>
          <col width="150"/>
          <col width="130"/>
          <col width="100"/>
          <col width="120"/>
          <col width="120"/>
          <col width="100"/>
          <thead>
            <tr valign="top">
              <td>Dataset name</td>
              <td>Source</td>
              <td>Full dataset size, n</td>
              <td>Minority class proportion (%)</td>
              <td>Features, n</td>
              <td>Core linear features (%)</td>
              <td>Continuous features (%)</td>
              <td>Nonlinearity</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Cardio</td>
              <td>OpenML</td>
              <td>70,000</td>
              <td>50</td>
              <td>11</td>
              <td>100</td>
              <td>45.5</td>
              <td>1.8</td>
            </tr>
            <tr valign="top">
              <td>Diabetes130</td>
              <td>OpenML</td>
              <td>101,766</td>
              <td>46.1</td>
              <td>35</td>
              <td>45.7</td>
              <td>20</td>
              <td>0.8</td>
            </tr>
            <tr valign="top">
              <td>NoShow</td>
              <td>OpenML</td>
              <td>110,527</td>
              <td>20.2</td>
              <td>8</td>
              <td>25</td>
              <td>12.5</td>
              <td>1.2</td>
            </tr>
            <tr valign="top">
              <td>BreastTumor</td>
              <td>OpenML</td>
              <td>116,640</td>
              <td>34.6</td>
              <td>9</td>
              <td>44.4</td>
              <td>33.3</td>
              <td>9.5</td>
            </tr>
            <tr valign="top">
              <td>Diabetes</td>
              <td>UCI</td>
              <td>253,680</td>
              <td>13.9</td>
              <td>21</td>
              <td>85.7</td>
              <td>19</td>
              <td>0.7</td>
            </tr>
            <tr valign="top">
              <td>COVID-19</td>
              <td>OpenML</td>
              <td>263,007</td>
              <td>39</td>
              <td>16</td>
              <td>37.5</td>
              <td>6.25</td>
              <td>2.2</td>
            </tr>
            <tr valign="top">
              <td>LOS</td>
              <td>OpenML</td>
              <td>318,438</td>
              <td>2.1</td>
              <td>11</td>
              <td>36.4</td>
              <td>27.3</td>
              <td>1.9</td>
            </tr>
            <tr valign="top">
              <td>CDC Heart Disease (2020)</td>
              <td>Kaggle</td>
              <td>319,795</td>
              <td>4.4</td>
              <td>17</td>
              <td>76.5</td>
              <td>23.5</td>
              <td>0.5</td>
            </tr>
            <tr valign="top">
              <td>CDC Heart Disease (2022)</td>
              <td>Kaggle</td>
              <td>394,509</td>
              <td>8.6</td>
              <td>39</td>
              <td>59</td>
              <td>15.4</td>
              <td>0.6</td>
            </tr>
            <tr valign="top">
              <td>Heart</td>
              <td>OpenML</td>
              <td>1,000,000</td>
              <td>44.4</td>
              <td>13</td>
              <td>92.3</td>
              <td>46.2</td>
              <td>1.6</td>
            </tr>
            <tr valign="top">
              <td>Hepatitis</td>
              <td>OpenML</td>
              <td>1,000,000</td>
              <td>20.8</td>
              <td>19</td>
              <td>94.7</td>
              <td>31.6</td>
              <td>4.0</td>
            </tr>
            <tr valign="top">
              <td>Lymph</td>
              <td>OpenML</td>
              <td>1,000,000</td>
              <td>45.7</td>
              <td>18</td>
              <td>83.3</td>
              <td>5.6</td>
              <td>2.3</td>
            </tr>
            <tr valign="top">
              <td>Pharynx</td>
              <td>OpenML</td>
              <td>1,000,000</td>
              <td>25.6</td>
              <td>11</td>
              <td>72.2</td>
              <td>18.2</td>
              <td>1.5</td>
            </tr>
            <tr valign="top">
              <td>Cholesterol</td>
              <td>OpenML</td>
              <td>1,000,000</td>
              <td>16.5</td>
              <td>13</td>
              <td>61.5</td>
              <td>30.8</td>
              <td>6.7</td>
            </tr>
            <tr valign="top">
              <td>Dermatology</td>
              <td>OpenML</td>
              <td>1,000,000</td>
              <td>13.2</td>
              <td>33</td>
              <td>84.8</td>
              <td>3</td>
              <td>3.6</td>
            </tr>
            <tr valign="top">
              <td>PBC</td>
              <td>OpenML</td>
              <td>1,000,000</td>
              <td>17.8</td>
              <td>18</td>
              <td>83.3</td>
              <td>55.6</td>
              <td>6.2</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
      <table-wrap position="float" id="table2">
        <label>Table 2</label>
        <caption>
          <p>Full–dataset AUC (separability) for each algorithm.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="400"/>
          <col width="150"/>
          <col width="150"/>
          <col width="150"/>
          <col width="150"/>
          <thead>
            <tr valign="top">
              <td>Dataset name</td>
              <td>XGB<sup>a</sup></td>
              <td>RF<sup>b</sup></td>
              <td>LR<sup>c</sup></td>
              <td>NN<sup>d</sup></td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Cardio</td>
              <td>0.802</td>
              <td>0.796</td>
              <td>0.784</td>
              <td>0.795</td>
            </tr>
            <tr valign="top">
              <td>Diabetes130</td>
              <td>0.662</td>
              <td>0.661</td>
              <td>0.654</td>
              <td>0.661</td>
            </tr>
            <tr valign="top">
              <td>NoShow</td>
              <td>0.608</td>
              <td>0.609</td>
              <td>0.596</td>
              <td>0.603</td>
            </tr>
            <tr valign="top">
              <td>BreastTumor</td>
              <td>0.777</td>
              <td>0.780</td>
              <td>0.682</td>
              <td>0.730</td>
            </tr>
            <tr valign="top">
              <td>Diabetes</td>
              <td>0.829</td>
              <td>0.822</td>
              <td>0.822</td>
              <td>0.826</td>
            </tr>
            <tr valign="top">
              <td>COVID-19</td>
              <td>0.664</td>
              <td>0.661</td>
              <td>0.642</td>
              <td>0.661</td>
            </tr>
            <tr valign="top">
              <td>LOS</td>
              <td>0.917</td>
              <td>0.915</td>
              <td>0.898</td>
              <td>0.901</td>
            </tr>
            <tr valign="top">
              <td>CDC Heart Disease (2020)</td>
              <td>0.815</td>
              <td>0.810</td>
              <td>0.810</td>
              <td>0.809</td>
            </tr>
            <tr valign="top">
              <td>CDC Heart Disease (2022)</td>
              <td>0.815</td>
              <td>0.801</td>
              <td>0.809</td>
              <td>0.801</td>
            </tr>
            <tr valign="top">
              <td>Heart</td>
              <td>0.965</td>
              <td>0.963</td>
              <td>0.949</td>
              <td>0.962</td>
            </tr>
            <tr valign="top">
              <td>Hepatitis</td>
              <td>0.979</td>
              <td>0.976</td>
              <td>0.939</td>
              <td>0.974</td>
            </tr>
            <tr valign="top">
              <td>Lymph</td>
              <td>0.957</td>
              <td>0.957</td>
              <td>0.934</td>
              <td>0.956</td>
            </tr>
            <tr valign="top">
              <td>Pharynx</td>
              <td>0.858</td>
              <td>0.858</td>
              <td>0.843</td>
              <td>0.856</td>
            </tr>
            <tr valign="top">
              <td>Cholesterol</td>
              <td>0.736</td>
              <td>0.728</td>
              <td>0.669</td>
              <td>0.714</td>
            </tr>
            <tr valign="top">
              <td>Dermatology</td>
              <td>0.859</td>
              <td>0.857</td>
              <td>0.823</td>
              <td>0.852</td>
            </tr>
            <tr valign="top">
              <td>PBC</td>
              <td>0.850</td>
              <td>0.850</td>
              <td>0.788</td>
              <td>0.823</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table2fn1">
            <p><sup>a</sup>XGB: XGBoost.</p>
          </fn>
          <fn id="table2fn2">
            <p><sup>b</sup>RF: random forest.</p>
          </fn>
          <fn id="table2fn3">
            <p><sup>c</sup>LR: logistic regression.</p>
          </fn>
          <fn id="table2fn4">
            <p><sup>d</sup>NN: neural network.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <sec>
        <title>Learning Curve Results</title>
        <p>Learning curves were fit to the 16 collected datasets. <xref ref-type="table" rid="table3">Table 3</xref> and <xref rid="figure2" ref-type="fig">Figure 2</xref> contain a full summary and visualization of estimated sample sizes across each classification algorithm and dataset. NNs required the largest sample sizes to reach stability and also had the most variability among the datasets (median 12,298, range 1824-180,835). LR required the smallest sample size to reach stability and also was the least variable (median 696, range 204-6798). XGB required approximately 3 times the sample size compared with RF, but the range of estimated sample sizes generated from RF models was nearly twice as wide (<xref ref-type="table" rid="table3">Table 3</xref>). <xref rid="figure3" ref-type="fig">Figure 3</xref> shows the fitted learning curves for each algorithm generated within each dataset, with a marker indicating the earliest sample size where the CV-AUC was within 2 points of the full–dataset AUC.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Sample sizes needed to reach AUC stability from the learning curve analysis.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="250"/>
            <col width="180"/>
            <col width="180"/>
            <col width="180"/>
            <col width="210"/>
            <thead>
              <tr valign="top">
                <td>Dataset name</td>
                <td>XGB<sup>a</sup></td>
                <td>RF<sup>b</sup></td>
                <td>LR<sup>c</sup></td>
                <td>NN<sup>d</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Cardio, n</td>
                <td>4341</td>
                <td>1476</td>
                <td>363</td>
                <td>4349</td>
              </tr>
              <tr valign="top">
                <td>Diabetes130, n</td>
                <td>9623</td>
                <td>3544</td>
                <td>1822</td>
                <td>16,823</td>
              </tr>
              <tr valign="top">
                <td>NoShow, n</td>
                <td>12,114</td>
                <td>2241</td>
                <td>742</td>
                <td>8084</td>
              </tr>
              <tr valign="top">
                <td>BreastTumor, n</td>
                <td>19,668</td>
                <td>17,383</td>
                <td>558</td>
                <td>28,424</td>
              </tr>
              <tr valign="top">
                <td>Diabetes, n</td>
                <td>9306</td>
                <td>2261</td>
                <td>1140</td>
                <td>8556</td>
              </tr>
              <tr valign="top">
                <td>COVID-19, n</td>
                <td>7026</td>
                <td>4750</td>
                <td>543</td>
                <td>4241</td>
              </tr>
              <tr valign="top">
                <td>LOS, n</td>
                <td>18,239</td>
                <td>15,381</td>
                <td>2555</td>
                <td>14,085</td>
              </tr>
              <tr valign="top">
                <td>CDC Heart Disease (2020), n</td>
                <td>15,177</td>
                <td>4995</td>
                <td>2243</td>
                <td>10,510</td>
              </tr>
              <tr valign="top">
                <td>CDC Heart Disease (2022), n</td>
                <td>30,534</td>
                <td>16,355</td>
                <td>6768</td>
                <td>25,120</td>
              </tr>
              <tr valign="top">
                <td>Heart, n</td>
                <td>960</td>
                <td>250</td>
                <td>204</td>
                <td>1824</td>
              </tr>
              <tr valign="top">
                <td>Hepatitis, n</td>
                <td>3513</td>
                <td>3265</td>
                <td>425</td>
                <td>15,302</td>
              </tr>
              <tr valign="top">
                <td>Lymph, n</td>
                <td>1409</td>
                <td>1992</td>
                <td>276</td>
                <td>4470</td>
              </tr>
              <tr valign="top">
                <td>Pharynx, n</td>
                <td>10,296</td>
                <td>2488</td>
                <td>317</td>
                <td>5260</td>
              </tr>
              <tr valign="top">
                <td>Cholesterol, n</td>
                <td>65,556</td>
                <td>140,499</td>
                <td>1368</td>
                <td>180,835</td>
              </tr>
              <tr valign="top">
                <td>Dermatology, n</td>
                <td>7979</td>
                <td>3103</td>
                <td>1696</td>
                <td>47,489</td>
              </tr>
              <tr valign="top">
                <td>PBC, n</td>
                <td>31,897</td>
                <td>71,194</td>
                <td>650</td>
                <td>53,453</td>
              </tr>
              <tr valign="top">
                <td>Median (Range)</td>
                <td>9960 (960-65,556)</td>
                <td>3404 (250-140,499)</td>
                <td>696 (204-6798)</td>
                <td>12,298 (1824-180,835)</td>
              </tr>
              <tr valign="top">
                <td>Mean (SD), Log-Transform</td>
                <td>9.16 (1.11)</td>
                <td>8.57 (1.55)</td>
                <td>6.75 (0.96)</td>
                <td>9.47 (1.17)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>XGB: XGBoost.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>RF: random forest.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>LR: logistic regression.</p>
            </fn>
            <fn id="table3fn4">
              <p><sup>d</sup>NN: neural network.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Visualization of expected sample sizes calculated from the learning-curve analysis of 16 data sets. LR: Logistic Regression. NN: Neural Networks. RF: Random Forest. XGB: XGBoost.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e60231_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Fitted learning curves for 16 data sets across 4 classification algorithms. Different colors were chosen for different algorithms. Each black "X" represents the point where the AUC at size n first comes within 2 points (or 0.02) of the asymptotic (ie, full–data set) AUC. LR: Logistic Regression. NN: Neural Networks. RF: Random Forest. XGB: XGBoost. AUC: Area under the receiver operating characteristic curve.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e60231_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Dataset–Level Characteristics</title>
        <p>Dataset–level characteristics were examined (<xref ref-type="table" rid="table1">Table 1</xref>). The average minority class percentage was 25.18% (SD 15.93%), and the average number of features was 18 (SD 9). The average percentage of continuous numeric features was 24.6% (SD 15.31%), and the average percentage of core linear features was 67.69% (SD 23.73%). The median dataset nonlinearity was 1.85 (range 0.50-9.50). Most datasets (13/16, 81.3%) had nonlinearity values under 5.0. Thus, for the purpose of model fitting, this was converted into a binary variable indicating either "high" (≥5) or "low" (&#60;5) nonlinearity. Scatterplots examining the visual relationships between log-expected sample sizes and each dataset–level characteristic can be found in <xref rid="figure4" ref-type="fig">Figure 4</xref>.</p>
        <p>Negative binomial regression models were fitted, examining the individual associations between each of the dataset–level characteristics and predicted sample sizes (<xref ref-type="table" rid="table4">Table 4</xref>). In these models, separability (full–dataset AUC) was multiplied by 100 for easier interpretation. For example, an AUC of 0.80 was entered as 80.0 in the models. For XGB, minority class proportion and separability were both inversely related to sample size; for every 1-unit increase in separability (where 50.0 was the baseline value), estimated sample sizes were affected by a multiplier of 0.955 (<italic>P</italic>=.02). For every 1% increase in minority class proportion, estimated sample sizes were affected by a multiplier of 0.959 (<italic>P</italic>&#60;.001). In datasets with high (≥5.0) values of nonlinearity, estimated sample sizes were affected by a multiplier of 3.889 (<italic>P</italic>=.005). In the RF analyses, results were similar for minority class proportion (0.931× multiplier for 1% increase, <italic>P</italic>&#60;.001), separability (0.939× multiplier for each 1-unit increase over 50.0, <italic>P</italic>=.047), and nonlinearity (15.985× multiplier for those with high values, <italic>P</italic>&#60;.001). However, the percentage of continuous numeric features (1.065× multiplier for every 1% increase, <italic>P</italic>=.003) was also individually statistically significant. For LR, minority class proportion (0.963× multiplier for every 1% increase, <italic>P</italic>=.001), the number of features (1.056× multiplier for each additional feature, <italic>P</italic>=.006), the percentage of core features (0.982× multiplier for 1% increase, <italic>P</italic>=.046), and the percentage of continuous numeric features (0.971× multiplier for 1% increase, <italic>P</italic>=.04) were significantly associated with sample size. 
Again, a more balanced ratio of classes reduced the needed sample size, while more features increased the sample size. However, a higher percentage of core linear features and a higher percentage of continuous numeric features lowered the sample size. Finally, for NNs, results were similar to XGB; minority class proportion (0.953× multiplier for 1% increase, <italic>P</italic>=.003), full–dataset AUC (0.950× multiplier for each 1% increase over 50.0, <italic>P</italic>=.03), and nonlinearity (6.85× multiplier for high values, <italic>P</italic>&#60;.001) were all individually statistically significant.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Relationships between each data set–level characteristic and expected sample sizes. The y-axis represents the natural-log transformed sample size values, while each x-axis represents varying levels of each data set–level characteristic. Separability multiplied by 100. All values representing proportions were multiplied by 100 so that 0 indicates 0% and 100 indicates 100%. Nonlinearity “low”: &#60;5, “high”: ≥5. LR: Logistic Regression. NN: Neural Networks. RF: Random Forest. XGB: XGBoost.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e60231_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Univariable association of each dataset–level characteristics with predicted sample size.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="160"/>
            <col width="150"/>
            <col width="60"/>
            <col width="0"/>
            <col width="150"/>
            <col width="60"/>
            <col width="0"/>
            <col width="150"/>
            <col width="60"/>
            <col width="0"/>
            <col width="0"/>
            <col width="0"/>
            <col width="150"/>
            <col width="60"/>
            <thead>
              <tr valign="top">
                <td>Variable</td>
                <td colspan="3">XGB<sup>a</sup></td>
                <td colspan="3">RF<sup>b</sup></td>
                <td colspan="4">LR<sup>c</sup></td>
                <td colspan="3">NN<sup>d</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Estimate (95% Confidence Interval)</td>
                <td><italic>P</italic><break/>value</td>
                <td colspan="2">Estimate (95% Confidence Interval)</td>
                <td><italic>P</italic><break/>value</td>
                <td colspan="2">Estimate (95% Confidence Interval)</td>
                <td><italic>P</italic><break/>value</td>
                <td colspan="4">Estimate (95% Confidence Interval)</td>
                <td><italic>P</italic><break/>value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Minority class proportion</td>
                <td>0.959 (0.934-0.985)</td>
                <td>&#60;.001</td>
                <td colspan="2">0.931 (0.891-0.977)</td>
                <td>&#60;.001</td>
                <td colspan="2">0.963 (0.945-0.983)</td>
                <td>.001</td>
                <td colspan="4">0.953 (0.921-0.990)</td>
                <td>.003</td>
              </tr>
              <tr valign="top">
                <td>Separability</td>
                <td>0.955 (0.903-1.000)</td>
                <td>.02</td>
                <td colspan="2">0.939 (0.861-1.028)</td>
                <td>.047</td>
                <td colspan="2">0.999 (0.946-1.051)</td>
                <td>.98</td>
                <td colspan="4">0.950 (0.898-1.007)</td>
                <td>.03</td>
              </tr>
              <tr valign="top">
                <td>Number of features</td>
                <td>1.000 (0.958-1.053)</td>
                <td>.99</td>
                <td colspan="2">0.966 (0.904-1.059)</td>
                <td>.35</td>
                <td colspan="2">1.056 (1.021-1.097)</td>
                <td>.006</td>
                <td colspan="4">1.001 (0.947-1.071)</td>
                <td>.98</td>
              </tr>
              <tr valign="top">
                <td>Continuous features (%)</td>
                <td>1.019 (0.986-1.057)</td>
                <td>.21</td>
                <td colspan="2">1.065 (1.013-1.121)</td>
                <td>.003</td>
                <td colspan="2">0.971 (0.942-1.006)</td>
                <td>.04</td>
                <td colspan="4">1.019 (0.982-1.059)</td>
                <td>.04</td>
              </tr>
              <tr valign="top">
                <td>Core linear Features (%)</td>
                <td>0.983 (0.958-1.001)</td>
                <td>.07</td>
                <td colspan="2">0.988 (0.941-1.034)</td>
                <td>.41</td>
                <td colspan="2">0.982 (0.959-1.004)</td>
                <td>.046</td>
                <td colspan="4">0.994 (0.960-1.028)</td>
                <td>.61</td>
              </tr>
              <tr valign="top">
                <td>Dataset nonlinearity</td>
                <td>3.889 (1.631-11.209)</td>
                <td>.005</td>
                <td colspan="2">15.985 (5.914-55.754)</td>
                <td>&#60;.001</td>
                <td colspan="2">0.585 (0.211-2.112)</td>
                <td>.35</td>
                <td colspan="4">6.853 (2.763-20.947)</td>
                <td>&#60;.001</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>XGB: XGBoost.</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>RF: random forest.</p>
            </fn>
            <fn id="table4fn3">
              <p><sup>c</sup>LR: logistic regression.</p>
            </fn>
            <fn id="table4fn4">
              <p><sup>d</sup>NN: neural network.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>In multivariable models for each algorithm, we selected the set of 3 predictors that minimized the AIC. The equation below, as well as <xref ref-type="table" rid="table5">Table 5</xref>, presents a summary of each algorithm-specific model, which shows the adjusted contribution of each predictor to the expected sample size.</p>
        <p>Equation 1: Empirically derived sample size equations for XGB, RF, LR, and NN algorithms.</p>
        <list list-type="order">
          <list-item>
            <p>
              <disp-formula>
                <graphic xlink:href="jmir_v26i1e60231_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
              </disp-formula>
            </p>
          </list-item>
          <list-item>
            <p>
              <disp-formula>
                <graphic xlink:href="jmir_v26i1e60231_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
              </disp-formula>
            </p>
          </list-item>
          <list-item>
            <p>
              <disp-formula>
                <graphic xlink:href="jmir_v26i1e60231_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
              </disp-formula>
            </p>
          </list-item>
          <list-item>
            <p>
              <disp-formula>
                <graphic xlink:href="jmir_v26i1e60231_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
              </disp-formula>
            </p>
          </list-item>
        </list>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Multivariable negative binomial regression—data-level characteristics effect on predicted sample size.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="120"/>
            <col width="0"/>
            <col width="80"/>
            <col width="0"/>
            <col width="120"/>
            <col width="0"/>
            <col width="80"/>
            <col width="0"/>
            <col width="0"/>
            <col width="120"/>
            <col width="0"/>
            <col width="80"/>
            <col width="0"/>
            <col width="0"/>
            <col width="120"/>
            <col width="0"/>
            <col width="80"/>
            <thead>
              <tr valign="top">
                <td>Variable</td>
                <td colspan="4">XGB<sup>a</sup></td>
                <td colspan="5">RF<sup>b</sup></td>
                <td colspan="5">LR<sup>c</sup></td>
                <td colspan="3">NN<sup>d</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Estimate (95% Confidence Interval)</td>
                <td colspan="2"><italic>P</italic> value</td>
                <td colspan="2">Estimate (95% Confidence Interval)</td>
                <td colspan="2"><italic>P</italic> value</td>
                <td colspan="3">Estimate (95% Confidence Interval)</td>
                <td colspan="2"><italic>P</italic> value</td>
                <td colspan="3">Estimate (95% Confidence Interval)</td>
                <td><italic>P</italic> value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Intercept</td>
                <td colspan="2">121,967 (57,883-262,108)</td>
                <td colspan="2">&#60;.001</td>
                <td colspan="2">26,872 (8118-92,378)</td>
                <td colspan="2">&#60;.001</td>
                <td colspan="3">1801 (904-3809)</td>
                <td colspan="2">&#60;.001</td>
                <td colspan="3">36,819 (18,691-76,970)</td>
                <td>&#60;.001</td>
              </tr>
              <tr valign="top">
                <td>Minority class proportion</td>
                <td colspan="2">0.956 (0.946-0.967)</td>
                <td colspan="2">&#60;.001</td>
                <td colspan="2">0.957 (0.940-0.975)</td>
                <td colspan="2">&#60;.001</td>
                <td colspan="3">0.968 (0.957-0.979)</td>
                <td colspan="2">&#60;.001</td>
                <td colspan="3">0.976 (0.957-0.996)</td>
                <td>.02</td>
              </tr>
              <tr valign="top">
                <td>Separability</td>
                <td colspan="2">0.952 (0.934-0.970)</td>
                <td colspan="2">&#60;.001</td>
                <td colspan="2">0.975 (0.947-1.004)</td>
                <td colspan="2">.07</td>
                <td colspan="3">—<sup>e</sup></td>
                <td colspan="2">—</td>
                <td colspan="3">—</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>Number of features</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="3">1.054 (1.034-1.076)</td>
                <td colspan="2">&#60;.001</td>
                <td colspan="3">—</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>Continuous features (%)</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="3">—</td>
                <td colspan="2">—</td>
                <td colspan="3">0.973 (0.950-0.997)</td>
                <td>.02</td>
              </tr>
              <tr valign="top">
                <td>Core linear features (%)</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="3">0.988 (0.980-0.996)</td>
                <td colspan="2">.005</td>
                <td colspan="3">—</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>Dataset nonlinearity</td>
                <td colspan="2">3.091 (2.011-4.922)</td>
                <td colspan="2">&#60;.001</td>
                <td colspan="2">12.298 (5.826-28.791)</td>
                <td colspan="2">&#60;.001</td>
                <td colspan="3">—</td>
                <td colspan="2">—</td>
                <td colspan="3">10.209 (4.274-26.569)</td>
                <td>&#60;.001</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table5fn1">
              <p><sup>a</sup>XGB: XGBoost; adjusted pseudo-<italic>R</italic><sup>2</sup>=0.845.</p>
            </fn>
            <fn id="table5fn2">
              <p><sup>b</sup>RF: random forest; adjusted pseudo-<italic>R</italic><sup>2</sup>=0.808.</p>
            </fn>
            <fn id="table5fn3">
              <p><sup>c</sup>LR: logistic regression; adjusted pseudo-<italic>R</italic><sup>2</sup>=0.798.</p>
            </fn>
            <fn id="table5fn4">
              <p><sup>d</sup>NN: neural network; adjusted pseudo-<italic>R</italic><sup>2</sup>=0.665.</p>
            </fn>
            <fn id="table5fn5">
              <p><sup>e</sup>Not available.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>For XGB and RF, minority class proportion, separability, and nonlinearity were the top 3 variables selected. For LR, minority class proportion, number of features, and percentage of core features were the top 3 variables. For NN, the top 3 variables were minority class proportion, percentage of continuous features, and nonlinearity. The direction and magnitude of coefficient estimates from multivariable models were similar to those obtained from univariable models (<xref ref-type="table" rid="table5">Table 5</xref>). Deviance-based <italic>R</italic><sup>2</sup> statistics, adjusted for the number of predictors added, were 0.845 (XGB), 0.808 (RF), 0.798 (LR), and 0.665 (NN; <xref ref-type="table" rid="table5">Table 5</xref>). This indicated that the dataset–level predictors explained a majority (66.5%-84.5%) of the total deviance in the data among all 4 models, although the NN model was weaker than the other 3. <xref rid="figure5" ref-type="fig">Figure 5</xref> shows the predicted sample sizes estimated from each algorithm-specific model at a variety of levels for each predictor. As can be seen, for all 4 classification algorithms, a balanced class ratio (50% cases versus 50% controls) resulted in the lowest predicted sample sizes.</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Fitted values derived from the 4 final negative binomial regression models for each classification algorithm. Shaded lines represent 95% CI. Imbalance = minority class proportion × 100. Separability = full dataset AUC × 100. Nonlinearity “low”: &#60;5, “high”: ≥5. LR: logistic regression. NN: neural network. RF: random forest. XGB: XGBoost.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e60231_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Conclusions</title>
        <p>In this study, we performed a learning curve analysis of 16 datasets over 4 different classification algorithms. From this, we identified the expected samples needed to reach AUCs within 2 points of those measured in the full dataset. We then examined the effects of dataset–level characteristics on expected sample sizes and provided formulas that can be used to predict the necessary sample size in a new dataset. We found that LR required the smallest sample size (median 696, range 204-6798) but performed slightly worse, on average, compared with more complicated algorithms. RF (median 3404, range 250-140,499) and XGB algorithms (median 9960, range 960-65,556) required larger sample sizes, as expected. NNs required the largest sample size (median 12,998, range 1824-180,835) and also had the most variability over the 16 datasets. This was an interesting finding, as our implementation of a deep learning approach was a basic architecture including only one hidden layer and 20 nodes. The fact that NNs required the largest median sample size of &#62;12,000 and were the most variable in terms of expected sample size demonstrates that implementation of deep learning methods—especially more complex deep-learning schemes than what we have provided—should probably be reserved for extremely large datasets for optimal performance and adequate discriminative stability. In addition, NNs had a weaker performance than XGB in every dataset. However, it is possible that a more expressive deep learning approach may yield higher AUCs compared with the best-performing ML solutions considered in this study, although to the best of our knowledge, no other studies have provided deep learning results that outperform ML approaches in these specific datasets. 
These results support current literature that suggests deep learning may not be optimal for tabular data analysis compared with tree-based methods [<xref ref-type="bibr" rid="ref35">35</xref>] when weighing accuracy trade-offs versus computational burden.</p>
        <p>Our results are consistent with Perlich et al [<xref ref-type="bibr" rid="ref12">12</xref>], which showed that LR might be optimal in small samples, but tree-based methods eventually provide the best performance in large datasets. Van der Ploeg et al [<xref ref-type="bibr" rid="ref16">16</xref>] determined that LR required a much lower number of events-per-variable for AUC stability, defined as CV-AUC within 0.01 of the full–dataset performance compared with RF and NN, which required &#62;200 events-per-variable. We can convert our expected sample sizes to events-per-variable by taking the predicted n, multiplying it by the minority class proportion of the dataset, and then dividing it by the number of features. In our study, LR required an average of 11 events-per-variable, XGB: 205, RF: 231, and NN: 342, which supports the notion that modern modeling techniques are “data-hungry” [<xref ref-type="bibr" rid="ref16">16</xref>].</p>
        <p>In summary, this study provides a simple framework for determining sample size in the context of 4 popular ML algorithms. Dataset–level variables that altered expected sample sizes varied by algorithm, but the class imbalance of the outcome, the strength and number of features, and the nonlinearity of the predictors were among the most influential characteristics. Most of these dataset–level characteristics can be reasonably guessed or influenced before the study begins. For example, researchers can examine previous studies in their field of interest to determine a reasonable range for separability and minority class proportion. For minority class proportion, which was a key selected feature for all 4 models, we observed that an optimal class balance (50% cases, 50% controls) led to the lowest predicted sample sizes, with each additional percentage point of balance decreasing the needed n by a multiplier of 0.96-0.98.</p>
        <p>In addition, researchers can use feature engineering to control the quality and overall number of predictors included in their models. As we have determined in this study, a smaller number of strong predictors will generally require less sample size than a large and noisy predictor set, supporting the idea that more features are not always ideal [<xref ref-type="bibr" rid="ref36">36</xref>]. Dataset nonlinearity is less intuitive to guess before data collection. In general, we found that datasets with nonlinearity values of at least 5.0 required approximately 3-12 times the amount of sample to reach stability, depending on the algorithm. However, in this study, 13/16 (81.3%) of the datasets had values under 5.0, which means that high values of dataset nonlinearity may be uncommon. Again, previous studies where both simple (LR) and complex (NN, RF, and XGB) methods are compared can help researchers determine if this value will be high. As a last resort, researchers can simply calculate expected sample sizes for both scenarios (&#60;5.0 and ≥5.0) using the model equations presented in this study and discuss the implications. It is also important to note that the effect of nonlinearity (and other dataset–level characteristics) on estimated sample sizes is diminished when the class imbalance is optimized. This is due to the multiplicative nature of the negative binomial regression models, which is illustrated in <xref rid="figure5" ref-type="fig">Figure 5</xref>. Thus, first and foremost, it is critical that researchers aim to collect a sample with the most balance between cases and controls.</p>
        <p>To our knowledge, no previous study has presented specific formulas for calculating sample size within the context of ML using multiple dataset–level characteristics. Although other studies have provided estimates of the needed sample size (or number of events per variable) to reach performance stability over a variety of classification algorithms, these works used simulation approaches or a limited number of real-life datasets and did not consider multiple specific dataset–level characteristics in calculation of these estimates [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>].</p>
      </sec>
      <sec>
        <title>Limitations and Future Work</title>
        <p>One limitation of our study was the relatively small sample size of only 16 datasets to develop our final models. Although this number is a relatively large amount in this area of research—similar learning curve analyses have typically examined fewer than 10 [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]—assessment of more would strengthen these models and provide clearer insight into dataset–level effects on expected sample size. However, the fact that we still observed many statistically significant relationships even with this small effective sample size is a strength of the study. In addition, the datasets we examined were all tabular in nature and had relatively low (&#60;50) numbers of features—generalization of the formulas presented in this study may not extrapolate to datasets with larger numbers of features or data arising from medical imaging or nontabular sources. Finally, it is important to note that ML, specifically algorithms like RF and XGB, can still outperform traditional parametric methods even if the sample size is limited (ie, under n=5000) and when hyperparameter tuning is implemented [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]. Therefore, these guidelines should serve as a supplement to be used in the first stage of predictive modeling, giving a general idea of how much sample is expected to reach a point of “diminishing returns,” where large amounts of additional data will only increase the AUC marginally.</p>
        <p>Future research in this area could examine different outcome types, such as regression, multiclass, or survival end points—or different performance metrics, such as area under the precision-recall curve or Brier score for probability calibration [<xref ref-type="bibr" rid="ref10">10</xref>]. In addition, a more in-depth examination of XGB, RF, and NN hyperparameters would be impactful, as all of the equations developed in this study considered only the default hyperparameter values, which could limit the generalizability of the results. However, in practice, it would be extremely difficult to guess plausible values of hyperparameters before data collection, so examination of different configurations would mostly be educational in nature and impractical to incorporate in sample size equations before any data collection. Finally, stacked ML methods [<xref ref-type="bibr" rid="ref39">39</xref>], or different gradient-boosted tree algorithms such as CatBoost [<xref ref-type="bibr" rid="ref40">40</xref>] or LightGBM [<xref ref-type="bibr" rid="ref41">41</xref>] could be investigated.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Additional information: data processing steps and additional tables and figures.</p>
        <media xlink:href="jmir_v26i1e60231_app1.docx" xlink:title="DOCX File , 35 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AUC</term>
          <def>
            <p>area under the curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">CV</term>
          <def>
            <p>cross-validation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">CV-AUC</term>
          <def>
            <p>cross-validated area under the curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">LASSO</term>
          <def>
            <p>least absolute shrinkage and selection operator</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">LR</term>
          <def>
            <p>logistic regression</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">MCAR</term>
          <def>
            <p>missing completely at random</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">ML</term>
          <def>
            <p>machine learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">MLP</term>
          <def>
            <p>multilayer perceptron</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">NN</term>
          <def>
            <p>neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">RF</term>
          <def>
            <p>random forest</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">XGB</term>
          <def>
            <p>XGBoost</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The datasets generated during and/or analyzed during this study are publicly available and can be accessed using links found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. In addition, the datasets generated during and/or analyzed during this study are available from the corresponding author on reasonable request. An RShiny app is publicly available where researchers can calculate the expected sample size based on our formulas with user-input dataset-level characteristics and can be found at [<xref ref-type="bibr" rid="ref42">42</xref>].</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Dong</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Dong</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence in healthcare: past, present and future</article-title>
          <source>Stroke Vasc Neurol</source>
          <year>2017</year>
          <volume>2</volume>
          <issue>4</issue>
          <fpage>230</fpage>
          <lpage>243</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://svn.bmj.com/lookup/pmidlookup?view=long&#38;pmid=29507784"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/svn-2017-000101</pub-id>
          <pub-id pub-id-type="medline">29507784</pub-id>
          <pub-id pub-id-type="pii">svn-2017-000101</pub-id>
          <pub-id pub-id-type="pmcid">PMC5829945</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>She</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>De Marchi</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>El-Zaatari</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Barnes</surname>
              <given-names>EL</given-names>
            </name>
            <name name-style="western">
              <surname>Kahkoska</surname>
              <given-names>AR</given-names>
            </name>
            <name name-style="western">
              <surname>Kosorok</surname>
              <given-names>MR</given-names>
            </name>
            <name name-style="western">
              <surname>Virkud</surname>
              <given-names>AV</given-names>
            </name>
          </person-group>
          <article-title>Machine learning and health science research: tutorial</article-title>
          <source>J Med Internet Res</source>
          <year>2024</year>
          <volume>26</volume>
          <fpage>e50890</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2024//e50890/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/50890</pub-id>
          <pub-id pub-id-type="medline">38289657</pub-id>
          <pub-id pub-id-type="pii">v26i1e50890</pub-id>
          <pub-id pub-id-type="pmcid">PMC10865203</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bonaccorso</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <source>Machine Learning Algorithms : Popular Algorithms for Data Science and Machine Learning</source>
          <year>2018</year>
          <publisher-loc>Birmingham</publisher-loc>
          <publisher-name>Packt Publishing</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dorey</surname>
              <given-names>FJ</given-names>
            </name>
          </person-group>
          <article-title>Statistics in brief: statistical power: what is it and when should it be used?</article-title>
          <source>Clin Orthop Relat Res</source>
          <year>2011</year>
          <volume>469</volume>
          <issue>2</issue>
          <fpage>619</fpage>
          <lpage>620</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/20585913"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s11999-010-1435-0</pub-id>
          <pub-id pub-id-type="medline">20585913</pub-id>
          <pub-id pub-id-type="pmcid">PMC3018227</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devijver</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Kittler</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <source>Pattern Recognition: A Statistical Approach</source>
          <year>1982</year>
          <publisher-loc>New Jersey, United States</publisher-loc>
          <publisher-name>Prentice-Hall</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hastie</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Tibshirani</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Friedman</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <source>The Elements of Statistical Learning: Data Mining, Inference, and Prediction</source>
          <year>2009</year>
          <publisher-loc>New York</publisher-loc>
          <publisher-name>Springer</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>He</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Garcia</surname>
              <given-names>EA</given-names>
            </name>
          </person-group>
          <article-title>Learning from imbalanced data</article-title>
          <source>IEEE Trans. Knowl. Data Eng</source>
          <year>2009</year>
          <volume>21</volume>
          <issue>9</issue>
          <fpage>1263</fpage>
          <lpage>1284</lpage>
          <pub-id pub-id-type="doi">10.1109/tkde.2008.239</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hanley</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>McNeil</surname>
              <given-names>BJ</given-names>
            </name>
          </person-group>
          <article-title>The meaning and use of the area under a receiver operating characteristic (ROC) curve</article-title>
          <source>Radiology</source>
          <year>1982</year>
          <volume>143</volume>
          <issue>1</issue>
          <fpage>29</fpage>
          <lpage>36</lpage>
          <pub-id pub-id-type="doi">10.1148/radiology.143.1.7063747</pub-id>
          <pub-id pub-id-type="medline">7063747</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bradley</surname>
              <given-names>AP</given-names>
            </name>
          </person-group>
          <article-title>The use of the area under the ROC curve in the evaluation of machine learning algorithms</article-title>
          <source>Pattern Recognition</source>
          <year>1997</year>
          <volume>30</volume>
          <issue>7</issue>
          <fpage>1145</fpage>
          <lpage>1159</lpage>
          <pub-id pub-id-type="doi">10.1016/s0031-3203(96)00142-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Macheret</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Gabriel</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Ohno-Machado</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>A tutorial on calibration measurements and calibration models for clinical prediction models</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2020</year>
          <volume>27</volume>
          <issue>4</issue>
          <fpage>621</fpage>
          <lpage>633</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/32106284"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocz228</pub-id>
          <pub-id pub-id-type="medline">32106284</pub-id>
          <pub-id pub-id-type="pii">5762806</pub-id>
          <pub-id pub-id-type="pmcid">PMC7075534</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Webb</surname>
              <given-names>GI</given-names>
            </name>
            <name name-style="western">
              <surname>Sammut</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Perlich</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Horváth</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Wrobel</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Korb</surname>
              <given-names>KB</given-names>
            </name>
          </person-group>
          <article-title>Learning curves in machine learning</article-title>
          <source>Encyclopedia of Machine Learning</source>
          <year>2011</year>
          <fpage>577</fpage>
          <lpage>580</lpage>
          <pub-id pub-id-type="doi">10.1007/978-0-387-30164-8_452</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Perlich</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Provost</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Simonoff</surname>
              <given-names>JS</given-names>
            </name>
          </person-group>
          <article-title>Tree induction vs. logistic regression: a learning-curve analysis</article-title>
          <source>Journal of Machine Learning Research</source>
          <year>2003</year>
          <fpage>211</fpage>
          <lpage>255</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmlr.org/papers/volume4/perlich03a/perlich03a.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mukherjee</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tamayo</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Rogers</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rifkin</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Engle</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Campbell</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Golub</surname>
              <given-names>TR</given-names>
            </name>
            <name name-style="western">
              <surname>Mesirov</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>Estimating dataset size requirements for classifying DNA microarray data</article-title>
          <source>J Comput Biol</source>
          <year>2003</year>
          <volume>10</volume>
          <issue>2</issue>
          <fpage>119</fpage>
          <lpage>142</lpage>
          <pub-id pub-id-type="doi">10.1089/106652703321825928</pub-id>
          <pub-id pub-id-type="medline">12804087</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Figueroa</surname>
              <given-names>RL</given-names>
            </name>
            <name name-style="western">
              <surname>Zeng-Treitler</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Kandula</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ngo</surname>
              <given-names>LH</given-names>
            </name>
          </person-group>
          <article-title>Predicting sample size required for classification performance</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2012</year>
          <volume>12</volume>
          <issue>1</issue>
          <fpage>8</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/1472-6947-12-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1472-6947-12-8</pub-id>
          <pub-id pub-id-type="medline">22336388</pub-id>
          <pub-id pub-id-type="pii">1472-6947-12-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC3307431</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Provost</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Jensen</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Oates</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <source>Efficient progressive sampling</source>
          <access-date>2024-11-19</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/pdf/10.1145/312129.312188">https://dl.acm.org/doi/pdf/10.1145/312129.312188</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>van der Ploeg</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Austin</surname>
              <given-names>PC</given-names>
            </name>
            <name name-style="western">
              <surname>Steyerberg</surname>
              <given-names>EW</given-names>
            </name>
          </person-group>
          <article-title>Modern modelling techniques are data hungry: a simulation study for predicting dichotomous endpoints</article-title>
          <source>BMC Med Res Methodol</source>
          <year>2014</year>
          <volume>14</volume>
          <fpage>137</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/1471-2288-14-137"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1471-2288-14-137</pub-id>
          <pub-id pub-id-type="medline">25532820</pub-id>
          <pub-id pub-id-type="pii">1471-2288-14-137</pub-id>
          <pub-id pub-id-type="pmcid">PMC4289553</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Richter</surname>
              <given-names>AN</given-names>
            </name>
            <name name-style="western">
              <surname>Khoshgoftaar</surname>
              <given-names>TM</given-names>
            </name>
          </person-group>
          <article-title>Sample size determination for biomedical big data with limited labels</article-title>
          <source>Netw Model Anal Health Inform Bioinforma</source>
          <year>2020</year>
          <volume>9</volume>
          <issue>1</issue>
          <pub-id pub-id-type="doi">10.1007/s13721-020-0218-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Micheel</surname>
              <given-names>CM</given-names>
            </name>
            <name name-style="western">
              <surname>Nass</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Omenn</surname>
              <given-names>GS</given-names>
            </name>
          </person-group>
          <source>Evolution of Translational Omics: Lessons Learned and the Path Forward</source>
          <year>2012</year>
          <publisher-loc>Washington, DC</publisher-loc>
          <publisher-name>National Academies Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>van Rijn</surname>
              <given-names>JN</given-names>
            </name>
            <name name-style="western">
              <surname>Holmes</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Pfahringer</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Vanschoren</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Algorithm selection on data streams</article-title>
          <source>Lecture Notes in Computer Science</source>
          <year>2014</year>
          <fpage>325</fpage>
          <lpage>336</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-319-11812-3_28</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Little</surname>
              <given-names>RJA</given-names>
            </name>
            <name name-style="western">
              <surname>Rubin</surname>
              <given-names>DB</given-names>
            </name>
          </person-group>
          <source>Statistical Analysis with Missing Data</source>
          <year>2020</year>
          <publisher-loc>Hoboken, NJ</publisher-loc>
          <publisher-name>John Wiley &#38; Sons, Inc</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cramer</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>The origins of logistic regression</article-title>
          <source>SSRN Electronic Journal</source>
          <year>2003</year>
          <fpage>16</fpage>
          <pub-id pub-id-type="doi">10.2139/ssrn.360300</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Guestrin</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>XGBoost: a scalable tree boosting system</article-title>
          <year>2016</year>
          <conf-name>Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining - KDD '16</conf-name>
          <conf-date>August 13-17, 2016</conf-date>
          <conf-loc>San Francisco, California, USA</conf-loc>
          <fpage>785</fpage>
          <lpage>794</lpage>
          <pub-id pub-id-type="doi">10.1145/2939672.2939785</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wright</surname>
              <given-names>MN</given-names>
            </name>
            <name name-style="western">
              <surname>Ziegler</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Ranger: a fast implementation of random forests for high dimensional data in C++ and R</article-title>
          <source>J Stat Softw</source>
          <year>2017</year>
          <volume>77</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>17</lpage>
          <pub-id pub-id-type="doi">10.18637/jss.v077.i01</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mall</surname>
              <given-names>PK</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>PK</given-names>
            </name>
            <name name-style="western">
              <surname>Srivastav</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Narayan</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Paprzycki</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Jaworska</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Ganzha</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>A comprehensive review of deep neural networks for medical image processing: recent developments and future opportunities</article-title>
          <source>Healthcare Analytics</source>
          <year>2023</year>
          <volume>4</volume>
          <fpage>100216</fpage>
          <pub-id pub-id-type="doi">10.1016/j.health.2023.100216</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nwanosike</surname>
              <given-names>EM</given-names>
            </name>
            <name name-style="western">
              <surname>Conway</surname>
              <given-names>BR</given-names>
            </name>
            <name name-style="western">
              <surname>Merchant</surname>
              <given-names>HA</given-names>
            </name>
            <name name-style="western">
              <surname>Hasan</surname>
              <given-names>SS</given-names>
            </name>
          </person-group>
          <article-title>Potential applications and performance of machine learning techniques and algorithms in clinical practice: a systematic review</article-title>
          <source>Int J Med Inform</source>
          <year>2022</year>
          <volume>159</volume>
          <fpage>104679</fpage>
          <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2021.104679</pub-id>
          <pub-id pub-id-type="medline">34990939</pub-id>
          <pub-id pub-id-type="pii">S1386-5056(21)00305-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>Foundation for Open Access Statistics</collab>
          </person-group>
          <source>Fast scalable R with H2O</source>
          <year>2015</year>
          <access-date>2024-11-25</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://h2o.ai/">https://h2o.ai/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Berrar</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Cross-validation</article-title>
          <source>Encyclopedia of Bioinformatics and Computational Biology</source>
          <year>2019</year>
          <volume>1</volume>
          <fpage>542</fpage>
          <lpage>545</lpage>
          <pub-id pub-id-type="doi">10.1016/b978-0-12-809633-8.20349-x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bates</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Watts</surname>
              <given-names>DG</given-names>
            </name>
          </person-group>
          <source>Nonlinear Regression Analysis and Its Applications</source>
          <year>2007</year>
          <publisher-loc>Hoboken, New Jersey</publisher-loc>
          <publisher-name>Wiley-Interscience</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kutner</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Nachtsheim</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Neter</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <source>Applied Linear Regression Models</source>
          <year>2004</year>
          <publisher-loc>London</publisher-loc>
          <publisher-name>McGraw-Hill Education - Europe</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Friedman</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hastie</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Tibshirani</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Regularization paths for generalized linear models via coordinate descent</article-title>
          <source>J Stat Softw</source>
          <year>2010</year>
          <volume>33</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>22</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/20808728"/>
          </comment>
          <pub-id pub-id-type="medline">20808728</pub-id>
          <pub-id pub-id-type="pmcid">PMC2929880</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hilbe</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <source>Negative Binomial Regression</source>
          <year>2007</year>
          <publisher-loc>Cambridge, England</publisher-loc>
          <publisher-name>Cambridge University Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bozdogan</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Model selection and Akaike's Information Criterion (AIC): the general theory and its analytical extensions</article-title>
          <source>Psychometrika</source>
          <year>1987</year>
          <volume>52</volume>
          <issue>3</issue>
          <fpage>345</fpage>
          <lpage>370</lpage>
          <pub-id pub-id-type="doi">10.1007/bf02294361</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vanegas</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Rondón</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Paula</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <source>glmtoolbox: Set of tools to data analysis using generalized linear models</source>
          <year>2024</year>
          <access-date>2024-11-19</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://CRAN.R-project.org/package=glmtoolbox">https://CRAN.R-project.org/package=glmtoolbox</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Veall</surname>
              <given-names>MR</given-names>
            </name>
            <name name-style="western">
              <surname>Zimmermann</surname>
              <given-names>KF</given-names>
            </name>
          </person-group>
          <article-title>Pseudo-R2 measures for some common limited dependent variable models</article-title>
          <source>Journal of Economic Surveys</source>
          <year>2006</year>
          <volume>10</volume>
          <issue>3</issue>
          <fpage>241</fpage>
          <lpage>259</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1467-6419.1996.tb00013.x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shwartz-Ziv</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Armon</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Tabular data: deep learning is not all you need</article-title>
          <source>arXiv</source>
          <year>2021</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2106.03253"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.inffus.2021.11.011</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Berisha</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Krantsevich</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hahn</surname>
              <given-names>PR</given-names>
            </name>
            <name name-style="western">
              <surname>Hahn</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dasarathy</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Turaga</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Liss</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Digital medicine and the curse of dimensionality</article-title>
          <source>NPJ Digit Med</source>
          <year>2021</year>
          <volume>4</volume>
          <issue>1</issue>
          <fpage>153</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-021-00521-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-021-00521-5</pub-id>
          <pub-id pub-id-type="medline">34711924</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-021-00521-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC8553745</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Papini</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Norman</surname>
              <given-names>SB</given-names>
            </name>
            <name name-style="western">
              <surname>Campbell-Sills</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Kessler</surname>
              <given-names>RC</given-names>
            </name>
            <name name-style="western">
              <surname>Ursano</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Jain</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Stein</surname>
              <given-names>MB</given-names>
            </name>
          </person-group>
          <article-title>Development and validation of a machine learning prediction model of posttraumatic stress disorder after military deployment</article-title>
          <source>JAMA Netw Open</source>
          <year>2023</year>
          <volume>6</volume>
          <issue>6</issue>
          <fpage>e2321273</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37389870"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.21273</pub-id>
          <pub-id pub-id-type="medline">37389870</pub-id>
          <pub-id pub-id-type="pii">2806713</pub-id>
          <pub-id pub-id-type="pmcid">PMC10314304</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abe</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Inaji</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hase</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Takahashi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sakai</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ayabe</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Tanaka</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Otomo</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Maehara</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>A prehospital triage system to detect traumatic intracranial hemorrhage using machine learning algorithms</article-title>
          <source>JAMA Netw Open</source>
          <year>2022</year>
          <volume>5</volume>
          <issue>6</issue>
          <fpage>e2216393</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/35687335"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2022.16393</pub-id>
          <pub-id pub-id-type="medline">35687335</pub-id>
          <pub-id pub-id-type="pii">2793225</pub-id>
          <pub-id pub-id-type="pmcid">PMC9187955</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>van der Laan</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Polley</surname>
              <given-names>EC</given-names>
            </name>
            <name name-style="western">
              <surname>Hubbard</surname>
              <given-names>AE</given-names>
            </name>
          </person-group>
          <article-title>Super learner</article-title>
          <source>Stat Appl Genet Mol Biol</source>
          <year>2007</year>
          <volume>6</volume>
          <fpage>Article25</fpage>
          <pub-id pub-id-type="doi">10.2202/1544-6115.1309</pub-id>
          <pub-id pub-id-type="medline">17910531</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Prokhorenkova</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gusev</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Vorobev</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dorogush</surname>
              <given-names>AV</given-names>
            </name>
            <name name-style="western">
              <surname>Gulin</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>CatBoost: unbiased boosting with categorical features</article-title>
          <source>arXiv (Cornell University)</source>
          <year>2021</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1706.09516"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ke</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Meng</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Finley</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>LightGBM: a highly efficient gradient boosting decision tree</article-title>
          <source>Advances in Neural Information Processing Systems</source>
          <year>2017</year>
          <volume>30</volume>
          <fpage>3146</fpage>
          <lpage>3154</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2017/file/6449f44a102fde848669bdd9eb6b76fa-Paper.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="web">
          <source>RShiny app</source>
          <access-date>2024-11-25</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://silveys.shinyapps.io/shiny_app_aim_1/">https://silveys.shinyapps.io/shiny_app_aim_1/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
