<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v26i1e46936</article-id>
      <article-id pub-id-type="pmid">39186324</article-id>
      <article-id pub-id-type="doi">10.2196/46936</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Sex-Based Performance Disparities in Machine Learning Algorithms for Cardiac Disease Prediction: Exploratory Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zeng</surname>
            <given-names>Juntong</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Antani</surname>
            <given-names>Sameer</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>van der Velde</surname>
            <given-names>Enno</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Guo</surname>
            <given-names>Lei</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Straw</surname>
            <given-names>Isabel</given-names>
          </name>
          <degrees>BMedSci, BMBS, MPH, MRES</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>University College London</institution>
            <addr-line>222 Euston Road</addr-line>
            <addr-line>London, NW1 2DA</addr-line>
            <country>United Kingdom</country>
            <phone>44 020 3549 5969</phone>
            <email>isabelstraw@doctors.org.uk</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0003-3550</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Rees</surname>
            <given-names>Geraint</given-names>
          </name>
          <degrees>BMBCh, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9623-7007</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Nachev</surname>
            <given-names>Parashkev</given-names>
          </name>
          <degrees>BMBCh, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2718-4423</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>University College London</institution>
        <addr-line>London</addr-line>
        <country>United Kingdom</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Isabel Straw <email>isabelstraw@doctors.org.uk</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>26</day>
        <month>8</month>
        <year>2024</year>
      </pub-date>
      <volume>26</volume>
      <elocation-id>e46936</elocation-id>
      <history>
        <date date-type="received">
          <day>3</day>
          <month>3</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>16</day>
          <month>6</month>
          <year>2023</year>
        </date>
        <date date-type="rev-recd">
          <day>13</day>
          <month>10</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>4</day>
          <month>5</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Isabel Straw, Geraint Rees, Parashkev Nachev. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 26.08.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2024/1/e46936" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>The presence of bias in artificial intelligence has garnered increased attention, with inequities in algorithmic performance being exposed across the fields of criminal justice, education, and welfare services. In health care, the inequitable performance of algorithms across demographic groups may widen health inequalities.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>Here, we identify and characterize bias in cardiology algorithms, looking specifically at algorithms used in the management of heart failure.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>Stage 1 involved a literature search of PubMed and Web of Science for key terms relating to cardiac machine learning (ML) algorithms. Papers that built ML models to predict cardiac disease were evaluated for their focus on demographic bias in model performance, and open-source data sets were retained for our investigation. Two open-source data sets were identified: (1) the University of California Irvine Heart Failure data set and (2) the University of California Irvine Coronary Artery Disease data set. We reproduced existing algorithms that have been reported for these data sets, tested them for sex biases in algorithm performance, and assessed a range of remediation techniques for their efficacy in reducing inequities. Particular attention was paid to the false negative rate (FNR), due to the clinical significance of underdiagnosis and missed opportunities for treatment.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>In stage 1, our literature search returned 127 papers, with 60 meeting the criteria for a full review and only 3 papers highlighting sex differences in algorithm performance. In the papers that reported sex, there was a consistent underrepresentation of female patients in the data sets. No papers investigated racial or ethnic differences. In stage 2, we reproduced algorithms reported in the literature, achieving mean accuracies of 84.24% (SD 3.51%) for data set 1 and 85.72% (SD 1.75%) for data set 2 (random forest models). For data set 1, the FNR was significantly higher for female patients in 13 out of 16 experiments, meeting the threshold of statistical significance (–17.81% to –3.37%; <italic>P</italic>&#60;.05). A smaller disparity in the false positive rate was significant for male patients in 13 out of 16 experiments (–0.48% to +9.77%; <italic>P</italic>&#60;.05). We observed an overprediction of disease for male patients (higher false positive rate) and an underprediction of disease for female patients (higher FNR). Sex differences in feature importance suggest that feature selection needs to be demographically tailored.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Our research exposes a significant gap in cardiac ML research, highlighting that the underperformance of algorithms for female patients has been overlooked in the published literature. Our study quantifies sex disparities in algorithmic performance and explores several sources of bias. We found an underrepresentation of female patients in the data sets used to train algorithms, identified sex biases in model error rates, and demonstrated that a series of remediation techniques were unable to address the inequities present.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>artificial intelligence</kwd>
        <kwd>machine learning</kwd>
        <kwd>cardiology</kwd>
        <kwd>health care</kwd>
        <kwd>health equity</kwd>
        <kwd>medicine</kwd>
        <kwd>cardiac</kwd>
        <kwd>quantitative evaluation</kwd>
        <kwd>inequality</kwd>
        <kwd>cardiac disease</kwd>
        <kwd>performance</kwd>
        <kwd>sex</kwd>
        <kwd>management</kwd>
        <kwd>heart failure</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Artificial intelligence (AI) has been proposed as an effective solution to many health care challenges and depends on the construction of machine learning (ML) algorithms from health care data. Recent research has drawn attention to the possibility that algorithms may exhibit bias when applied to different demographic groups [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref6">6</xref>]. Such biases may widen health inequalities and negatively impact marginalized patients, such as female patients, minoritized racial and ethnic groups, and other neglected subpopulations [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref7">7</xref>].</p>
        <p>Over the past 5 years, an increasing number of studies have quantified disparities in algorithmic performance for underserved populations [<xref ref-type="bibr" rid="ref2">2</xref>-<xref ref-type="bibr" rid="ref7">7</xref>]. Daneshjou and colleagues [<xref ref-type="bibr" rid="ref2">2</xref>] demonstrated that state-of-the-art dermatology algorithms tend to perform worse on darker skin tones; Seyyed-Kalantari and colleagues [<xref ref-type="bibr" rid="ref3">3</xref>] exposed biases in radiology algorithms; and Thompson and colleagues [<xref ref-type="bibr" rid="ref4">4</xref>] reported increased false negative errors when classifying opioid misuse disorder for Black patients compared to White patients. Beyond specific diagnoses, researchers have demonstrated that infrastructural AI systems used in hospital settings can be subject to referral bias, demonstrated by Obermeyer and colleagues [<xref ref-type="bibr" rid="ref5">5</xref>] who highlighted a hospital treatment allocation algorithm that overlooked the health needs of Black patients. Yet despite the increasing number of papers describing this issue, most of the current uses of biomedical AI technologies do not account for the problem of bias [<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. Here, we evaluate algorithmic inequity in ML algorithms used for predicting cardiac disease, focusing on heart failure (HF).</p>
      </sec>
      <sec>
        <title>ML for HF</title>
        <p>HF is a clinical syndrome in which the heart is unable to maintain a cardiac output adequate to meet the metabolic demands of the body [<xref ref-type="bibr" rid="ref9">9</xref>]. Traditionally, algorithmic tools capable of identifying at-risk patients have played a key role in informing decisions on HF management and end-of-life care [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. In recent years, ML algorithms that leverage biochemical data have been proposed as a superior alternative to traditional statistical models for identifying at-risk patients with HF [<xref ref-type="bibr" rid="ref13">13</xref>]. A range of ML techniques outperforms traditional risk scores in forecasting HF-related events [<xref ref-type="bibr" rid="ref13">13</xref>]. Yet given that existing medical research has described sex differences in both the presentation and management of HF, algorithms trained on existing data may perform differently for male versus female patients [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>].</p>
      </sec>
      <sec>
        <title>Sex Differences in HF</title>
        <p>HF presents differently in female patients compared with male patients [<xref ref-type="bibr" rid="ref14">14</xref>]. Female patients experience a wider range of symptoms, including higher fluid overload and lower health-related quality of life [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. Moreover, female patients who present with HF are on average older, sustain a higher ejection fraction (EF) throughout later stages of the disease, and have a lower incidence of previous ischemic heart disease [<xref ref-type="bibr" rid="ref15">15</xref>]. Furthermore, the biochemical tests used to detect cardiac disease have been demonstrated to perform less well for female patients [<xref ref-type="bibr" rid="ref16">16</xref>]. Troponin is 1 key biomarker used to predict disease, which has been demonstrated to be less sensitive in female patients [<xref ref-type="bibr" rid="ref16">16</xref>]. Standard troponin criteria fail to detect 1 out of 5 acute myocardial infarcts occurring in female patients [<xref ref-type="bibr" rid="ref16">16</xref>]. Historically, the neglect of sex differences in cardiac pathophysiology has disadvantaged female patients, and if not considered during ML development, these inequities may manifest in the novel algorithms being integrated into cardiac care [<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref19">19</xref>].</p>
        <p>In our research, we scope the published literature reporting algorithms that predict HF and investigate whether existing papers give attention to bias in ML algorithms. Furthermore, we examine the data sets of existing models for demographic representation, evaluate demographic inequities in algorithmic performance, and assess the efficacy of a series of bias-mitigation techniques.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Study Design</title>
        <p>Our analysis consists of two stages: (1) a literature review of papers describing ML models used to predict HF and (2) a quantitative analysis of identified models, evaluating inequities in algorithm performance. The flowchart in <xref rid="figure1" ref-type="fig">Figure 1</xref> provides an overview of our approach.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>A flowchart detailing the steps of our methodology, including (1) the initial literature search and qualitative evaluation of identified studies and (2) the identification of data sets and interrogation of algorithms for demographic bias. FAGTB: Fair Adversarial Gradient Tree Boosting; HF: heart failure; ML: machine learning; UCI: University of California Irvine.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e46936_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Stage 1 Literature Review: Qualitative Evaluation of Published Papers</title>
        <p>We searched PubMed and Web of Science between April 1, 2022, and May 22, 2022, to identify ML algorithms used to predict cardiac disease adhering to PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) guidelines for systematic reviews (<xref rid="figure2" ref-type="fig">Figure 2</xref> [<xref ref-type="bibr" rid="ref20">20</xref>] and Tables S1 and S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]). All abstracts were reviewed, and papers were included for full-text review if they met the following criteria: (1) the target diagnosis was HF, (2) the model used biochemical markers to predict disease, and (3) the computational methods involved an ML approach (including supervised, unsupervised, and deep learning).</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>PRISMA 2020 flow diagram for new systematic reviews which included searches of databases and registers only (PRISMA template obtained from PRISMA at https://prisma-statement.org/prismastatement/flowdiagram.aspx).

**Reasons for exclusion: Reason 1: The study did not focus on biochemical data or laboratory tests, instead utilizing different modalities (eg, visual data from radiological scans); Reason 2: The study did not use machine learning techniques (eg, it used traditional statistical methods); Reason 3: The study did not describe empirical research, involving the development of ML models for prediction of cardiac disease (eg, instead the paper was a review or commentary); Reason 4: The retrieved study was not a full paper, instead it was a conference or meeting abstract.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e46936_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Of the retained papers, full texts were then reviewed to evaluate whether authors (1) reported the demographic make-up of data sets and (2) evaluated demographic inequities in algorithm performance, meaning that the authors specifically examined differences in algorithmic performance by demographic groups defined by protected characteristics [<xref ref-type="bibr" rid="ref17">17</xref>].</p>
        <p>Throughout the literature review, any identified open-source data sets were maintained for use in stage 2.</p>
      </sec>
      <sec>
        <title>Stage 2: Quantitative Evaluation of Model Performance</title>
        <p>Two open-source data sets were uncovered in our literature review: (1) data set 1: University of California Irvine for Heart Failure Prediction [<xref ref-type="bibr" rid="ref21">21</xref>] and (2) data set 2: University of California Irvine Cleveland Heart Disease data set for identifying coronary artery disease (CAD) [<xref ref-type="bibr" rid="ref22">22</xref>]. Descriptive statistics were performed on both data sets, evaluating the mean and variance of the data set variables for sexes separately, affected by disease or death (<xref ref-type="table" rid="table1">Table 1</xref> and Tables S3-S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Descriptive statistics of the variables in data set 1 (heart failure; N=299), stratified by target (death) and sex<sup>a</sup>.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="320"/>
            <col width="170"/>
            <col width="180"/>
            <col width="170"/>
            <col width="160"/>
            <thead>
              <tr valign="top">
                <td>Variables</td>
                <td colspan="4">Sex and death (target variable)<sup>b</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Female (sex=0; n=105)</td>
                <td colspan="2">Male (sex=1; n=194)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Survived (HF<sup>c</sup> death=0)</td>
                <td>Death (HF death=1)</td>
                <td>Survived (HF death=0)</td>
                <td>Death (HF death=1)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Total count, n (%)</td>
                <td>71 (67.62)</td>
                <td>34 (32.38)</td>
                <td>132 (68.04)</td>
                <td>62 (31.96)</td>
              </tr>
              <tr valign="top">
                <td>Age (years), mean (SD)</td>
                <td>58.6 (10.6)</td>
                <td>62.2 (12.3)</td>
                <td>58.8 (10.7)</td>
                <td>66.9 (13.5)</td>
              </tr>
              <tr valign="top">
                <td>Anemia (Boolean), mean (SD)</td>
                <td>0.5 (0.5)</td>
                <td>0.6 (0.5)</td>
                <td>0.4 (0.5)</td>
                <td>0.4 (0.5)</td>
              </tr>
              <tr valign="top">
                <td>Creatinine phosphokinase (mcg/L), mean (SD)</td>
                <td>462.0 (517.7)</td>
                <td>507.7 (779.7)</td>
                <td>582.8 (853.2)</td>
                <td>759.3 (1532.3)</td>
              </tr>
              <tr valign="top">
                <td>Diabetes mellitus (Boolean), mean (SD)</td>
                <td>0.5 (0.5)</td>
                <td>0.6 (0.5)</td>
                <td>0.4 (0.5)</td>
                <td>0.3 (0.5)</td>
              </tr>
              <tr valign="top">
                <td>Ejection fraction (percentage), mean (SD)</td>
                <td>41.9 (11.6)</td>
                <td>37.5 (14.6)</td>
                <td>39.4 (10.4)</td>
                <td>31.2 (10.7)</td>
              </tr>
              <tr valign="top">
                <td>High blood pressure (Boolean), mean (SD)</td>
                <td>0.4 (0.5)</td>
                <td>0.5 (0.5)</td>
                <td>0.3 (0.5)</td>
                <td>0.4 (0.5)</td>
              </tr>
              <tr valign="top">
                <td>Platelets (kiloplatelets/mL), mean (SD)</td>
                <td>289,757.6 (98,655.9)</td>
                <td>259,512.7 (107,588.6)</td>
                <td>254,232.4 (94,985.6)</td>
                <td>254,663.7 (94,060.8)</td>
              </tr>
              <tr valign="top">
                <td>Serum creatinine (mg/dL), mean (SD)</td>
                <td>1.1 (0.6)</td>
                <td>1.9 (1.6)</td>
                <td>1.2 (0.7)</td>
                <td>1.8 (1.4)</td>
              </tr>
              <tr valign="top">
                <td>Serum sodium (mEq/L), mean (SD)</td>
                <td>137.4 (3.6)</td>
                <td>135.5 (6.7)</td>
                <td>137.1 (4.2)</td>
                <td>135.3 (3.8)</td>
              </tr>
              <tr valign="top">
                <td>Smoking (Boolean), mean (SD)</td>
                <td>0.0 (0.1)</td>
                <td>0.1 (0.3)</td>
                <td>0.5 (0.5)</td>
                <td>0.4 (0.5)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>Full details of data set variables are available in Tanvir et al [<xref ref-type="bibr" rid="ref21">21</xref>].</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>For the death variable, a value of 1 indicates mortality.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>HF: heart failure.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>Using these data sets, we rebuilt the ML algorithms described in the published literature and performed an additional analysis exploring inequities in algorithmic performance for demographic subgroups. As the only protected characteristic reported was sex, we focus on sex disparities in performance. Despite our initial aim to focus on HF, we retained an uncovered CAD data set to investigate whether trends identified for HF generalized to patients with CAD [<xref ref-type="bibr" rid="ref22">22</xref>]. Tables S3 and S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> provide details on data set 1 and data set 2, respectively.</p>
      </sec>
      <sec>
        <title>Model Reproduction</title>
        <p>We rebuilt the models described in the existing literature for these data sets, focusing on random forest (RF) algorithms, which have been widely reported to be the most effective models [<xref ref-type="bibr" rid="ref23">23</xref>]. For both data sets, data was split into test or training subsets (0.7:0.3), RF models were built using SciKit Learn, and RF parameters were tuned using GridSearch CV (SciKit Learn). We adopted a bootstrapping approach to quantify uncertainty, such that models were built, trained, and tested 100 times, from which average results were derived with SD.</p>
      </sec>
      <sec>
        <title>Statistical Analysis</title>
        <p>Across the 100 runs, sex differences in each algorithm evaluation metric (equations 1-10) were calculated and averaged, with accompanying statistical tests performed to evaluate for statistical significance of any identified sex disparities. Our method for examining differences in algorithmic error rates builds on the foundational work from Buolamwini and Gebru [<xref ref-type="bibr" rid="ref24">24</xref>], who demonstrated that a range of ML algorithms for facial recognition performed poorly on darker-skinned female patients. To evaluate for statistical significance, independent 2-tailed <italic>t</italic> tests were performed where the data was normally distributed, and Mann-Whitney <italic>U</italic> tests were performed where the data was not normally distributed. Kolmogorov-Smirnov tests were used to assess for normality [<xref ref-type="bibr" rid="ref25">25</xref>].</p>
      </sec>
      <sec>
        <title>Variations in Model Development</title>
        <sec>
          <title>Overview</title>
          <p>We then introduced a variety of changes to the model development, to evaluate the impact on the identified sex disparities in performance.</p>
        </sec>
        <sec>
          <title>Changes to Model Training Data</title>
          <p>In total, 1 widely proposed bias mitigation technique includes preprocessing the training data of a model to account for demographic representation, with previous research highlighting the benefit of training on demographically balanced or demographically stratified data sets [<xref ref-type="bibr" rid="ref26">26</xref>]. We therefore created a range of data sets with varied sex representation and assessed for the impact on algorithm performance disparities. To form the sex-balanced data set, we used the oversampling function of <italic>SMOTE()</italic>, which has been proposed as an effective method for improving the representation of underserved populations in ML data sets [<xref ref-type="bibr" rid="ref27">27</xref>]. The <italic>SMOTE</italic> package generates new minority data points based on existing minority samples through linear interpolation [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. Models were rebuilt as per the <italic>Model Reproduction</italic> section, using 4 different training data sets (sex-imbalanced, sex-balanced, and sex-specific; Tables S6 and S7 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>): (1) original sex-imbalanced training data, (2) sex-balanced training data, (3) female-only training data, and (4) male-only training data experiments.</p>
        </sec>
        <sec>
          <title>Changes to Feature Selection</title>
          <p>To understand why models make certain decisions, researchers in the domain of “explainable AI” have demonstrated how feature evaluation may provide important information regarding model performance for different subpopulations [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref28">28</xref>]. To do this, Shapley values have been widely accepted as a unified measure of feature importance since their proposal in 2017 [<xref ref-type="bibr" rid="ref29">29</xref>].</p>
          <p>In our experiments, we first perform an exploratory analysis, comparing feature importance for models trained on the male versus female data sets. Second, we create 4 feature subsets from the original data sets, to evaluate the impact of changing the feature selection on performance disparities. As described in the introduction, existing clinical research has described demographic differences in the biochemical and clinical markers of HF disease (eg, sex differences in EF and troponin levels) [<xref ref-type="bibr" rid="ref16">16</xref>]. Thus, we delineate 4 different feature subsets that vary in this information, to examine whether certain feature subsets perform better for different demographic groups. These four feature subsets are described in detail in Tables S8 and S9 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> and include (1) features with sex, (2) features without sex, (3) biochemical features, and (4) clinical features.</p>
          <p>Our final series of experiments are therefore performed across the four training data sets (sex-imbalanced, sex-balanced, and sex-specific), and the four feature sets giving 16 total experiments: (1) original sex-imbalanced training data experiments (across four feature subsets), (2) sex-balanced training data experiments (across four feature subsets), (3) female training data experiments (across four feature subsets), and (4) male training data experiments (across four feature subsets).</p>
        </sec>
      </sec>
      <sec>
        <title>Model Evaluation and Identification of Performance Disparities</title>
        <p>Models are evaluated using global evaluation metrics (eg, accuracy) and specific error rates (eg, false negative rate [FNR]; equations 1-10). The difference between male and female scores is calculated to give a model’s “sex performance disparity” (equation 10). To evaluate for statistical significance, Kolmogorov-Smirnov Tests were used to assess for the normality of the data, following which independent 2-tailed <italic>t</italic> tests were performed where the data were normally distributed, and Mann-Whitney <italic>U</italic> tests were performed where the data were not normally distributed.</p>
        <p>Our choice of evaluation metrics is guided by the clinical consequence of each of these scores.</p>
        <p>The existing research on algorithmic bias has highlighted the importance of examining error rates, particularly in medicine where a false negative clinically translates to missed diagnoses or opportunities for treatment [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. As described by Afrose and colleagues [<xref ref-type="bibr" rid="ref26">26</xref>], focusing on global metrics of performance such as area under the receiver operating characteristic curve scores can neglect subtler disparities arising from differences in error rates affecting subgroups. When selecting a bias assessment metric, previous studies have chosen to focus on FNR and false positive rate (FPR), due to the clinical implications of these errors [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. Equations 5-8 place the error rates in their clinical context, demonstrating that the FNR represents missed diagnoses and potentially missed treatment. For the error rates, we use the threshold of 0.5, as we are investigating performance inequities in the existing reported models that used these default settings.</p>
        <p>Error rate definitions are as follows:</p>
        <graphic xlink:href="jmir_v26i1e46936_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <graphic xlink:href="jmir_v26i1e46936_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <graphic xlink:href="jmir_v26i1e46936_fig10.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <graphic xlink:href="jmir_v26i1e46936_fig11.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>Clinical implications of error rates are as follows:</p>
        <disp-formula>
          <italic>True Positive Rate = Correct diagnosis that patient has disease</italic>
          <bold>(5)</bold>
        </disp-formula>
        <disp-formula>
          <italic>False Positive Rate = Misdiagnosis of disease when patient is healthy</italic>
          <bold>(6)</bold>
        </disp-formula>
        <disp-formula>
          <italic>True Negative Rate = Correct diagnosis that patient is healthy</italic>
          <bold>(7)</bold>
        </disp-formula>
        <disp-formula>
          <italic>False Negative Rate = Misdiagnosis that patient is healthy when patient has disease</italic>
          <bold>(8)</bold>
        </disp-formula>
        <p>The accuracy evaluation metric is calculated as follows:</p>
        <graphic xlink:href="jmir_v26i1e46936_fig12.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>Sex performance disparity is calculated as follows:</p>
        <disp-formula>
          <italic>Sex performance disparity = Score for male patients (mean) – Score for female patients (mean)</italic>
          <bold>(10)</bold>
        </disp-formula>
      </sec>
      <sec>
        <title>Fairness Techniques: Fair Adversarial Gradient Tree Boosting</title>
        <p>We implemented a recent fairness technique to evaluate whether these approaches applied to bias in HF algorithms. The Fair Adversarial Gradient Tree Boosting (FAGTB) is a recent technique proposed by Grari et al [<xref ref-type="bibr" rid="ref8">8</xref>] for mitigating bias in decision tree classifiers and the authors demonstrate the success of their technique on 4 data sets. The authors focus on 2 definitions of fairness: demographic parity and equalized odds [<xref ref-type="bibr" rid="ref8">8</xref>]. The equalized odds metric focuses on model FPR and FNR, and hence we highlight this for our paper. A summary of these fairness metrics is provided in Section S1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> for further interest.</p>
        <p>The definition of equalized odds is as follows:</p>
        <graphic xlink:href="jmir_v26i1e46936_fig13.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>To assess for equalized odds, the authors measure the disparate mistreatment, which computes the absolute difference in the FPR and in the FNR between the two demographic groups.</p>
        <p>The disparate FPR is calculated as follows:</p>
        <graphic xlink:href="jmir_v26i1e46936_fig14.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>The disparate FNR is calculated as follows:</p>
        <graphic xlink:href="jmir_v26i1e46936_fig15.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>We compare the performance of the FAGTB algorithm to a standard Gradient Tree Algorithm. As per the original FAGTB paper, we repeat 10 experiments randomly sampling 2 subsets (0.8:0.2) and report evaluation metrics for the test set.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>Ethical approval was not required for this study as all data used were sourced from publicly available open-source data sets [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>] under a CC-BY 4.0 license. No direct patient contact or sensitive personal data was involved, ensuring compliance with research standards.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Literature Review Search Results</title>
        <p>Our search returned 127 papers, of which 60 met the criteria for full review and 3 highlighted sex differences in model performance. In the papers that reported sex, there was a consistent underrepresentation of female patients. No papers investigated racial or ethnic differences. Further, 1 paper focused specifically on female patients with HF, in which Tison et al [<xref ref-type="bibr" rid="ref32">32</xref>] highlighted that HF was more common in people who were older, White, with a higher mean number of pregnancies, a higher BMI, and were less likely to have Medicare.</p>
      </sec>
      <sec>
        <title>Descriptive Statistics and Feature Importance</title>
        <sec>
          <title>Data Set 1 (HF)</title>
          <p>The mean descriptive statistics for each feature present in the HF data set are provided in <xref ref-type="table" rid="table1">Table 1</xref>, which demonstrates subtle sex differences in the presentation of the disease. For HF deaths, male patients tend to be older than their female counterparts, with a higher creatinine phosphokinase, lower likelihood of diabetes, lower EF, and lower blood pressure.</p>
          <p>Our exploratory analysis identified further sex differences on examining feature importance. <xref rid="figure3" ref-type="fig">Figure 3</xref> compares the rankings of feature importance for ML models built to predict HF built from the female data set compared to the male data set. These differences are important as existing ML algorithms built on mixed-sex cohorts suggest that EF can be used alone for modeling, an approach that may disadvantage female patients [<xref ref-type="bibr" rid="ref23">23</xref>].</p>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>Comparison of feature rankings for male and female patients, ordered by SHAP values. SHAP: Shapley additive explanations.</p>
            </caption>
            <graphic xlink:href="jmir_v26i1e46936_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Data Set 2 (CAD)</title>
          <p>Table S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> provides details of the CAD data set and demonstrates that female patients with CAD have higher resting blood pressure and higher cholesterol compared to male patients. The categorical variable “resting electrocardiogram” is also higher for female patients, due to a higher incidence of left ventricular hypertrophy.</p>
        </sec>
      </sec>
      <sec>
        <title>Model Results and Performance Disparities</title>
        <p>We replicated the algorithms described in the existing literature, reproducing the same previously reported mean predictive accuracies of 84.24% (3.51 SD) for data set 1 and 85.72% (1.75 SD) for data set 2 [<xref ref-type="bibr" rid="ref23">23</xref>]. In <xref ref-type="table" rid="table2">Tables 2</xref> and <xref ref-type="table" rid="table3">3</xref>, we present the disparity in performance for the sexes, where a positive value indicates a higher value for male patients (see equation 10).</p>
        <p>For data set 1, <xref ref-type="table" rid="table2">Table 2</xref> demonstrates that in 13 out of 16 experiments, the FNR is higher for female patients, meeting the threshold of statistical significance (mean difference of –17.81% to –3.37%; <italic>P</italic>&#60;.05). <xref rid="figure4" ref-type="fig">Figure 4</xref> represents this disparity in performance graphically, providing the point estimates of FNR for the sexes separately and highlighting that the disparity in FNR persisted across the variations in training data and selected features.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Sex performance disparities for models built from data set 1 (heart failure disease)–sex performance disparities are calculated as the performance for male patients minus the performance for female patients (see equation 10). Thus, a positive value indicates a higher score for male patients and a negative value indicates a higher score for female patients. All disparities are presented alongside results of significance testing, where significant differences between the sexes are highlighted with a footnote (<italic>P</italic>&#60;.05).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="170"/>
            <col width="110"/>
            <col width="80"/>
            <col width="110"/>
            <col width="90"/>
            <col width="130"/>
            <col width="90"/>
            <col width="100"/>
            <col width="90"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Disparity in model performance (score for male patients – score for female patients)</td>
                <td colspan="8">Feature subset used in model training</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td>Features with sex</td>
                <td><italic>P</italic> value</td>
                <td>Features without sex</td>
                <td><italic>P</italic> value</td>
                <td>Biochemical features</td>
                <td><italic>P</italic> value</td>
                <td>Clinical features</td>
                <td><italic>P</italic> value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="10">
                  <bold>Sex-imbalanced training data</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Accuracy disparity (%)</td>
                <td>1.63</td>
                <td>.03<sup>a</sup></td>
                <td>–0.72</td>
                <td>.30</td>
                <td>0.10</td>
                <td>.88</td>
                <td>–0.50</td>
                <td>.49</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>ROC_AUC<sup>b</sup> disparity (%)</td>
                <td>3.14</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>0.43</td>
                <td>.61</td>
                <td>1.51</td>
                <td>.09</td>
                <td>0.47</td>
                <td>.60</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>FNR<sup>c</sup> disparity (%)</td>
                <td>–7.53</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–3.84</td>
                <td>.02<sup>a</sup></td>
                <td>–5.15</td>
                <td>.01<sup>a</sup></td>
                <td>–3.49</td>
                <td>.049<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>FPR<sup>d</sup> disparity (%)</td>
                <td>1.26</td>
                <td>.07</td>
                <td>2.97</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>2.11</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>2.56</td>
                <td>&#60;.01<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td colspan="10">
                  <bold>Sex-balanced training data</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Accuracy disparity (%)</td>
                <td>–4.78</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–7.25</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–9.42</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–3.63</td>
                <td>&#60;.01<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>ROC_AUC disparity (%)</td>
                <td>7.0</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>4.27</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>0.15</td>
                <td>.83</td>
                <td>8.32</td>
                <td>&#60;.01<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>FNR disparity (%)</td>
                <td>–17.81</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–13.91</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–3.37</td>
                <td>.04<sup>a</sup></td>
                <td>–16.09</td>
                <td>&#60;.01<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>FPR disparity (%)</td>
                <td>3.90</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>5.37</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>3.07</td>
                <td>&#60;.001<sup>a</sup></td>
                <td>–0.54</td>
                <td>.24</td>
              </tr>
              <tr valign="top">
                <td colspan="10">
                  <bold>Female training data</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Accuracy disparity (%)</td>
                <td>–10.95</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–9.75</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–12.32</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–9.64</td>
                <td>&#60;.01<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>ROC_AUC disparity (%)</td>
                <td>0.60</td>
                <td>.57</td>
                <td>0.57</td>
                <td>.23</td>
                <td>–2.92</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–0.53</td>
                <td>.07</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>FNR disparity (%)</td>
                <td>–7.42</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–10.91</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–2.24</td>
                <td>.27</td>
                <td>1.55</td>
                <td>.01<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>FPR disparity (%)</td>
                <td>8.61</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>9.77</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>8.08</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–0.48</td>
                <td>.04<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td colspan="10">
                  <bold>Male training data</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Accuracy disparity (%)</td>
                <td>–5.46</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–5.73</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–8.73</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–2.46</td>
                <td>&#60;.01<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>ROC_AUC disparity (%)</td>
                <td>4.98</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>4.54</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–1.59</td>
                <td>.049<sup>a</sup></td>
                <td>8.32</td>
                <td>&#60;.01<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>FNR disparity (%)</td>
                <td>–13.96</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–13.32</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–1.68</td>
                <td>.33</td>
                <td>–16.58</td>
                <td>&#60;.01<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>FPR disparity (%)</td>
                <td>4.00</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>4.24</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>4.86</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–0.06</td>
                <td>.35</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>Indicates a statistically significant difference (<italic>P</italic>&#60;.05) between the model’s performance on male versus female patients.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>ROC_AUC: area under the receiver operating characteristic curve.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>FNR: false negative rate.</p>
            </fn>
            <fn id="table2fn4">
              <p><sup>d</sup>FPR: false positive rate.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Sex performance disparities for models built from data set 2 (coronary artery disease)—sex performance disparities are calculated as the performance for male patients minus the performance for female patients (see equation 10). Thus, a positive value indicates a higher score for male patients, and a negative value indicates a higher score for female patients. All disparities are presented alongside results of significance testing, where significant differences between the sexes are highlighted with a footnote (<italic>P</italic>&#60;.05).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="170"/>
            <col width="120"/>
            <col width="80"/>
            <col width="120"/>
            <col width="80"/>
            <col width="130"/>
            <col width="80"/>
            <col width="110"/>
            <col width="80"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Disparity in model performance (score for male patients – score for female patients)</td>
                <td colspan="8">Feature subset used in model training</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td>Features with sex</td>
                <td><italic>P</italic> value</td>
                <td>Features without sex</td>
                <td><italic>P</italic> value</td>
                <td>Biochemical features</td>
                <td><italic>P</italic> value</td>
                <td>Clinical features</td>
                <td><italic>P</italic> value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="10">
                  <bold>Sex-imbalanced training data</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Accuracy disparity (%)</td>
                <td>0.32</td>
                <td>.50</td>
                <td>0.64</td>
                <td>.17</td>
                <td>0.13</td>
                <td>.80</td>
                <td>0.25</td>
                <td>.61</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>ROC_AUC<sup>b</sup> disparity (%)</td>
                <td>3.86</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>4.24</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>3.05</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>3.91</td>
                <td>&#60;.01<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>FNR<sup>c</sup> disparity (%)</td>
                <td>–11.66</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–12.52</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–10.81</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–12.38</td>
                <td>&#60;.01<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>FPR<sup>d</sup> disparity (%)</td>
                <td>3.94</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>4.04</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>4.71</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>4.57</td>
                <td>&#60;.01<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td colspan="10">
                  <bold>Sex-balanced training data</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Accuracy disparity (%)</td>
                <td>–4.01</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–5.12</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–7.32</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–2.86</td>
                <td>&#60;.01<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>ROC_AUC disparity (%)</td>
                <td>–3.89</td>
                <td>.01<sup>a</sup></td>
                <td>–4.91</td>
                <td>.01<sup>a</sup></td>
                <td>–7.18</td>
                <td>&#60;.001<sup>a</sup></td>
                <td>–2.75</td>
                <td>&#60;.01<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>FNR disparity (%)</td>
                <td>7.69</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>10.54</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>15.59</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>6.61</td>
                <td>&#60;.01<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>FPR disparity (%)</td>
                <td>0.10</td>
                <td>.87</td>
                <td>–0.72</td>
                <td>.19</td>
                <td>–1.23</td>
                <td>.29</td>
                <td>–1.11</td>
                <td>.06</td>
              </tr>
              <tr valign="top">
                <td colspan="10">
                  <bold>Female training data</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Accuracy disparity (%)</td>
                <td>–9.25</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–11.34</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–11.49</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–8.69</td>
                <td>&#60;.01<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>ROC_AUC disparity (%)</td>
                <td>–8.97</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–10.95</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–11.10</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–8.45</td>
                <td>&#60;.01<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>FNR disparity (%)</td>
                <td>18.98</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>22.60</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>27.23</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>17.86</td>
                <td>&#60;.01<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>FPR disparity (%)</td>
                <td>–1.04</td>
                <td>.07</td>
                <td>–0.70</td>
                <td>.20</td>
                <td>–5.02</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–0.96</td>
                <td>.09</td>
              </tr>
              <tr valign="top">
                <td colspan="10">
                  <bold>Male training data</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Accuracy disparity (%)</td>
                <td>6.38</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>5.66</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–1.66</td>
                <td>.02<sup>a</sup></td>
                <td>6.10</td>
                <td>&#60;.01<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>ROC_AUC disparity (%)</td>
                <td>6.30</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>5.57</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>1.52</td>
                <td>.07</td>
                <td>5.86</td>
                <td>.01<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>FNR disparity (%)</td>
                <td>–10.12</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–10.10</td>
                <td>&#60;.001<sup>a</sup></td>
                <td>1.67</td>
                <td>.17</td>
                <td>–12.64</td>
                <td>&#60;.01<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>FPR disparity (%)</td>
                <td>–2.48</td>
                <td>&#60;.01<sup>a</sup></td>
                <td>–1.04</td>
                <td>.07</td>
                <td>1.38</td>
                <td>.24</td>
                <td>0.92</td>
                <td>.15</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>Indicates a statistically significant difference (<italic>P</italic>&#60;.05) between the model’s performance on male versus female patients. To determine statistical significance, the Kolmogorov-Smirnov tests were first run on the sex-stratified results to determine the distribution of data (normal or not). Independent 2-tailed <italic>t</italic> tests were used where data were normally distributed, and Mann-Whitney <italic>U</italic> tests were used when data were not normally distributed.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>ROC_AUC: area under the receiver operating characteristic curve.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>FNR: false negative rate.</p>
            </fn>
            <fn id="table3fn4">
              <p><sup>d</sup>FPR: false positive rate.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>A smaller disparity in the FPR was statistically significant for male patients in 13 out of 16 experiments (–0.48% to +9.77%; <italic>P</italic>&#60;.05). The sex performance disparities in accuracy and area under the receiver operating characteristic curve varied depending on the underlying shifts in the error rates for each sex (<xref ref-type="table" rid="table2">Table 2</xref> and <xref rid="figure5" ref-type="fig">Figure 5</xref>). On examining the individual error rates, we see consistencies in the sex disparities across feature sets, most notably an overprediction of disease for male patients (higher FPR) and an underprediction of disease for female patients (higher FNR: <xref ref-type="table" rid="table2">Table 2</xref>).</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Data set 1 (heart failure): a series of violin plots showing the sex-stratified performance (false negative rate [0%-100%]) of the random forests trained across the 4 feature sets, on the different variations in training data. The plots show male (orange) and female (gray) FNR alongside each other, in groups of 4 (divided by a line) according to the training data used (sex-imbalanced, sex-balanced, female, and male). The feature set used is indicated within each training data group (features with sex, features without sex, biochemical features, and clinical features). See Multimedia Appendixes.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e46936_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Data set 1 (heart failure): a series of violin plots showing the sex-stratified performance (accuracy [0%-100%]) of the random forests trained across the 4 feature sets, on the different variations in training data. The plots show male (orange) and female (gray) accuracy alongside each other, in groups of 4 (divided by a line) according to the training data used (sex-imbalanced, sex-balanced, female, and male). The feature set used is indicated within each training data group (features with sex, features without sex, biochemical features, and clinical features). See Multimedia Appendixes.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e46936_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Our findings for data set 2 were similar to those for data set 1, such that models built on the original sex-imbalanced data set demonstrated a higher FNR for female patients (mean difference of –10.81% to –12.52%; <italic>P</italic>&#60;.05; <xref ref-type="table" rid="table3">Table 3</xref>) and a higher FPR for male patients (3.94% to 4.71%; <italic>P</italic>&#60;.05; <xref ref-type="table" rid="table3">Table 3</xref>). <xref rid="figure6" ref-type="fig">Figure 6</xref> visualizes the disparity graphically, and demonstrates that, unlike data set 1, the disparity in error rates reversed when training on sex-balanced data and female-only data (<xref rid="figure6" ref-type="fig">Figure 6</xref>). <xref rid="figure7" ref-type="fig">Figure 7</xref> illustrates the disparity in accuracy between the sexes, where we see that the direction of the disparity varies depending on the training data and feature set (<xref rid="figure7" ref-type="fig">Figure 7</xref>).</p>
        <fig id="figure6" position="float">
          <label>Figure 6</label>
          <caption>
            <p>Data set 2 (coronary artery disease): a series of violin plots showing the sex-stratified performance (false negative rate [0%-100%]) of the random forests trained across the 4 feature sets, on the different variations in training data. The plots show male (orange) and female (gray) FNR alongside each other, in groups of 4 (divided by a line) according to the training data used (sex-imbalanced, sex-balanced, female, and male). The feature set used is indicated within each training data group (features with sex, features without sex, biochemical features, and clinical features). See Multimedia Appendixes. FNR: false negative rate.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e46936_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure7" position="float">
          <label>Figure 7</label>
          <caption>
            <p>Data set 2 (coronary artery disease): a series of violin plots showing the sex-stratified performance (accuracy [0%-100%]) of the random forests trained across the 4 feature sets, on the variations in training data. The plots show male (orange) and female (gray) accuracy alongside each other, in groups of 4 (divided by a line) according to the training data used (sex-imbalanced, sex-balanced, female, and male). The feature set used is indicated within each training data group (features with sex, features without sex, biochemical features, and clinical features).</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e46936_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Variations in Training Data</title>
        <sec>
          <title>Sex-Balanced Training Data</title>
          <p>Training on sex-balanced data led to a fall in mean accuracy for all patients in data set 1 (76%, SD 3.46% vs 84.24%, SD 3.51%), with a more substantial drop in mean accuracy for male patients (73.61%, SD 4.84% vs 84.84%, SD 4.16%; <xref ref-type="table" rid="table4">Table 4</xref> and <xref rid="figure5" ref-type="fig">Figure 5</xref>). The opposite trend was seen in data set 2, with models trained on sex-balanced data outperforming models trained on sex-imbalanced data for all patients (87.65%, SD 1.77% vs 85.72%, SD 1.75%) and for female patients (89.66%, SD 2.44% vs 85.48%, SD 4.12%; <xref ref-type="table" rid="table4">Table 4</xref>). The models trained on sex-balanced data in data set 2 reduced the FNR for both sexes when using the full feature set (female patients 4.79%, SD 2.58% vs 24.86%, SD 11.35%; male patients 12.48%, SD 4.11% vs 13.19%, SD 3.26%; <xref ref-type="table" rid="table4">Table 4</xref> and <xref rid="figure6" ref-type="fig">Figure 6</xref>). The differences between the data sets may relate to underlying differences in the 2 cardiac conditions. Further, the failure to improve performance with sex-balanced training data may reflect the issues of mixing data that has conflicting indicators for disease.</p>
          <table-wrap position="float" id="table4">
            <label>Table 4</label>
            <caption>
              <p>Model results when trained on sex-specific subsets for all patients and male or female patients separately, looking at the “features including sex” subset.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="110"/>
              <col width="140"/>
              <col width="110"/>
              <col width="100"/>
              <col width="100"/>
              <col width="0"/>
              <col width="130"/>
              <col width="110"/>
              <col width="100"/>
              <col width="100"/>
              <thead>
                <tr valign="top">
                  <td>Results</td>
                  <td colspan="5">Data set 1 (heart failure)</td>
                  <td colspan="4">Data set 2 (coronary artery disease)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Sex-imbalanced training data (n=209)</td>
                  <td>Sex-balanced training data (n=272)</td>
                  <td>Female training data (n=136)</td>
                  <td>Male training data (n=136)</td>
                  <td colspan="2">Sex-imbalanced training data (n=522)</td>
                  <td>Sex-balanced training data (n=715)</td>
                  <td>Female training data (n=358)</td>
                  <td>Male training data (n=358)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td colspan="2">
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>All patients, mean accuracy (SD)</td>
                  <td>84.24 (3.51)</td>
                  <td>76.0 (3.46)</td>
                  <td>74.68 (3.53)</td>
                  <td>75.12 (3.71)</td>
                  <td colspan="2">85.72 (1.75)</td>
                  <td>87.65 (1.77)</td>
                  <td>86.06 (1.67)</td>
                  <td>82.63 (1.94)</td>
                </tr>
                <tr valign="top">
                  <td>Female patients, mean accuracy (SD)</td>
                  <td>83.21 (6.37)</td>
                  <td>78.39 (19.68)</td>
                  <td>80.15 (4.43)</td>
                  <td>77.85 (5.21)</td>
                  <td colspan="2">85.48 (4.12)</td>
                  <td>89.66 (2.44)</td>
                  <td>90.69 (2.38)</td>
                  <td>79.44 (3.20)</td>
                </tr>
                <tr valign="top">
                  <td>Male patients, mean accuracy (SD)</td>
                  <td>84.84 (4.16)</td>
                  <td>73.61 (4.84)</td>
                  <td>69.20 (5.96)</td>
                  <td>72.39 (5.32)</td>
                  <td colspan="2">85.80 (2.14)</td>
                  <td>85.65 (2.23)</td>
                  <td>81.44 (3.02)</td>
                  <td>85.82 (2.30)</td>
                </tr>
                <tr valign="top">
                  <td>Female patients, mean FNR<sup>a</sup> (SD)</td>
                  <td>35.98 (16.72)</td>
                  <td>85.25 (14.58)</td>
                  <td>74.04 (17.68)</td>
                  <td>78.66 (14.0)</td>
                  <td colspan="2">24.86 (11.35)</td>
                  <td>4.79 (2.58)</td>
                  <td>4.00 (2.74)</td>
                  <td>22.32 (5.25)</td>
                </tr>
                <tr valign="top">
                  <td>Male patients, mean FNR (SD)</td>
                  <td>28.45 (10.41)</td>
                  <td>67.43 (16.6)</td>
                  <td>66.62 (17.32)</td>
                  <td>64.70 (14.9)</td>
                  <td colspan="2">13.19 (3.26)</td>
                  <td>12.48 (4.11)</td>
                  <td>22.97 (5.20)</td>
                  <td>12.20 (3.41)</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table4fn1">
                <p><sup>a</sup>FNR: false negative rate.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
        <sec>
          <title>Sex-Specific Training Data</title>
          <p>For data set 1, mean accuracy for all patients when trained on sex-imbalanced data (84.24%, SD 3.51%) falls when training both on female-specific data (74.68%, SD 3.53%) and male-specific training data (75.12%, SD 3.71%), likely related to the smaller training data. For data set 2, mean accuracy for all patients when trained on sex-imbalanced data (85.72%, SD 1.75%) improves when training on female-specific data (86.06%, SD 1.67%) and falls when training on male-specific training data (82.63%, SD 1.94%). The overall improvement seen in the data set 2 models when trained on female data relates to the increase in accuracy for female patients (90.69%, SD 2.38% vs 85.48%, SD 4.12%) co-occurring with a smaller decrease in accuracy for male patients (81.44%, SD 3.02% vs 85.80%, SD 2.14%; <xref ref-type="table" rid="table4">Table 4</xref> and <xref rid="figure7" ref-type="fig">Figure 7</xref>).</p>
          <p>Unsurprisingly, performance for each sex is lowest when trained on the opposing sex (<xref ref-type="table" rid="table4">Table 4</xref>, <xref rid="figure4" ref-type="fig">Figures 4</xref>-<xref rid="figure7" ref-type="fig">7</xref>). In data set 1, same-sex training was preferable to opposite-sex training; however, this did not improve results compared to the models built from sex-imbalanced and sex-balanced training data, likely relating to the smaller sample size (<xref ref-type="table" rid="table4">Table 4</xref>). In contrast, data set 2 had greater training data available and demonstrated that sex-specific training is beneficial to both sexes above the sex-imbalanced models (<xref ref-type="table" rid="table4">Table 4</xref>).</p>
        </sec>
        <sec>
          <title>Variations in Feature Sets</title>
          <p>Models built on the biochemical features subset gave the worst performance in terms of accuracy and FNR (<xref rid="figure4" ref-type="fig">Figures 4</xref>-<xref rid="figure7" ref-type="fig">7</xref>). For data set 2, biochemical features included just cholesterol and fasting blood sugar, and so, the fall in performance may relate to information loss. Additionally, Table S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> highlights the different biochemical profiles for male and female patients who were sick, with female patients who were sick demonstrating a far higher cholesterol level than their male counterparts (mean values: 279.2 female patients who were sick vs 247.5 male patients who were sick).</p>
        </sec>
        <sec>
          <title>FAGTB Model</title>
          <p>The disparity in false negative rate (DispFNR) was consistently higher than the disparity in false positive rate (<xref ref-type="table" rid="table5">Table 5</xref>). Compared to the Gradient Boosting Classifier, the FAGTB reduced the DispFNR for both data sets (data set 1: 0.20 vs 0.21; data set 2: 0.19 vs 0.28); however, the DispFNR that disadvantaged female patients persisted. The fall in DispFNR and disparity in false positive rate that occurred with FAGTB was associated with a fall in overall accuracy for both data sets.</p>
          <table-wrap position="float" id="table5">
            <label>Table 5</label>
            <caption>
              <p>Results of bias mitigation with Fair Adversarial Gradient Tree Boosting (FAGTB).</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="30"/>
              <col width="420"/>
              <col width="0"/>
              <col width="420"/>
              <col width="0"/>
              <col width="130"/>
              <thead>
                <tr valign="top">
                  <td colspan="3">Results on test set, averaged over 10 experiments</td>
                  <td colspan="2">Gradient boosting classifier</td>
                  <td>FAGTB</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td colspan="6">
                    <bold>Data set 1 (heart failure): experiments run on sex-imbalanced data with all features (averaged over 10 experiments)</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Accuracy</td>
                  <td colspan="2">71.3</td>
                  <td colspan="2">71.2</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>DispFPR<sup>a</sup></td>
                  <td colspan="2">0.08</td>
                  <td colspan="2">0.08</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>DispFNR<sup>b</sup></td>
                  <td colspan="2">0.21</td>
                  <td colspan="2">0.20</td>
                </tr>
                <tr valign="top">
                  <td colspan="6">
                    <bold>Data set 2 (coronary artery disease): experiments run on sex-imbalanced data with all features (averaged over 10 experiments)</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Accuracy</td>
                  <td colspan="2">86.3</td>
                  <td colspan="2">82.9</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>DispFPR</td>
                  <td colspan="2">0.06</td>
                  <td colspan="2">0.06</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>DispFNR</td>
                  <td colspan="2">0.28</td>
                  <td colspan="2">0.19</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table5fn1">
                <p><sup>a</sup>DispFPR: disparity in false positive rate.</p>
              </fn>
              <fn id="table5fn2">
                <p><sup>b</sup>DispFNR: disparity in false negative rate.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Our study sheds light on an important gap in existing cardiac ML research, with significant implications for digital health equity. We find that the majority of published ML studies predicting HF fail to acknowledge the underrepresentation of female patients in their data sets and do not perform stratified model evaluations, thus failing to assess sex disparities in algorithmic performance. Our secondary evaluation of 2 cardiac data sets exposed a neglected sex disparity in model performance, highlighting the importance of integrating these methods into future studies that use ML methods for cardiac modeling. In our approach, we identified several potential sources of algorithmic bias.</p>
        <p>First, we detected the underrepresentation of female patients in training data sets that may produce inequalities in model fidelity. Despite introducing oversampling techniques to address this omission, the disparities in performance persisted, suggesting that addressing data set representation alone is not a sufficient measure for mitigating bias. Further, our experiments demonstrated that oversampling could reduce overall performance, which may result from the mixing of conflicting data (ie, male vs female feature rankings). In addition, oversampling with synthetic instances solely from the data set at hand does not provide the machine with more information; it simply redirects attention and therefore cannot easily compensate for demographic underrepresentation [<xref ref-type="bibr" rid="ref33">33</xref>]. When balancing the data set, our methods did not include undersampling due to our small data sets; however, this may be a potential avenue for future research.</p>
        <p>Second, we considered featurization and highlighted sex differences in the biochemical manifestation of disease. In current clinical practice, the diagnostic parameters used for identifying pathology are drawn from research trials dominated by male physiology: it is perhaps unsurprising therefore that algorithms built from these data tend to underperform in female disease. There is a growing body of research that critiques the use of unisex thresholds in medicine for biochemical tests; our sex-stratified analysis of the cardiac data sets and the identified sex differences in feature rankings supports these proposals [<xref ref-type="bibr" rid="ref16">16</xref>].</p>
        <p>There are further sources of inequitable performance that our evaluation cannot distinguish between. It may be that the sex differences in the physiological expression of disease mean that the prediction is harder to extract from 1 population. As a result, 1 sex may require more complex models than another, with differing architecture and degrees of flexibility. It may also simply be that there are differences in the predictability of 1 group compared with another, such that if the physiology of 1 group is more opaque, it may ultimately not be possible to resolve the observed disparities. McCradden and colleagues [<xref ref-type="bibr" rid="ref34">34</xref>] detail this challenge further in their review, highlighting that differences across groups may not always indicate inequity. There are complex causal relationships between biological, environmental, and social factors that underpin the differences in disease rates seen across population subgroups [<xref ref-type="bibr" rid="ref34">34</xref>]. While models must not promote different standards of care according to protected characteristics, differences between groups may not necessarily reflect discriminatory practice [<xref ref-type="bibr" rid="ref34">34</xref>].</p>
        <p>Our research was limited by the available information in the data sets. The absence of race or ethnicity data precluded the evaluation of their effects. Furthermore, the absence of other demographic data in the studies we identified prevented the investigation of health inequities that might impact the LGBTQ+ (lesbian, gay, bisexual, transgender, queer) community, disadvantaged socioeconomic groups, or other subgroups. Previous research has described historic and institutional biases that contribute to worse health outcomes for these groups, and evolving AI systems require the same scrutiny to ensure these harms do not become embedded within digital systems [<xref ref-type="bibr" rid="ref35">35</xref>-<xref ref-type="bibr" rid="ref37">37</xref>].</p>
        <p>Throughout this paper, we have used the terms male and female to reference biological sex, so as not to conflate sex and gender. With the ongoing problematic conflation of sex and gender in medicine, stratification of model performance by either sex or gender is often impossible, which was noted in our own work [<xref ref-type="bibr" rid="ref35">35</xref>-<xref ref-type="bibr" rid="ref37">37</xref>]. Beyond the features discussed above, there is a wide range of additional factors that we cannot account for. For example, creatinine phosphokinase was a key feature in HF modeling yet existing studies have demonstrated the variation in these levels for manual laborers and athletes, illustrating how occupation may impact a patient’s physiology [<xref ref-type="bibr" rid="ref38">38</xref>].</p>
        <p>To account for the complex interactions that potentiate disease, and the heterogeneous nature of patient cohorts, we require more complex modeling capable of capturing the full range of intersecting factors influencing patient health (eg, sex differences may be mediated by income). Unsupervised high-dimensional representation learning may be the path forward for this purpose [<xref ref-type="bibr" rid="ref39">39</xref>]. In addition to improving representation, unsupervised techniques enable us to detect neglected subpopulations without predetermining a characteristic of interest, facilitating the identification of the previously overlooked disadvantaged. In this sense, AI may provide a route forward to uncovering and addressing bias, by deploying more complex modeling that can improve patient representation and by revealing previously neglected disparities in the provision of care.</p>
      </sec>
      <sec>
        <title>Conclusions and Limitations</title>
        <p>In our paper, we have identified inequities in the performance of cardiac ML algorithms. Our findings are limited by the small size of the uncovered data sets, reducing their potential generalizability, and hence we propose that larger studies focused on this issue are required. These data sets also came from the same source, as we found a limited number of open-access databases due to the confidential nature of patient data and issues of proprietary ownership. In addition, we focused on RF models to replicate the papers uncovered in our literature search; however, ML models may differ in their degrees of performance disparity, and an evaluation across the range of ML model options is an important next step.</p>
        <p>In our paper we did not attempt to solve bias; instead, we highlighted a problem that exists throughout cardiology that requires further attention. The issue we have identified in these ML models is a foundational problem across medical modeling, in any instance where the use of an “average” is applied to a diverse population. It is possible that unsupervised ML and complex representational modeling may be a route forward for capturing heterogeneity in a previously unattainable manner and addressing issues of bias [<xref ref-type="bibr" rid="ref39">39</xref>]. Our findings demonstrate that examining performance inequities across demographic subgroups is an essential approach for identifying biases in AI and preventing the perpetuation of inequalities in digital health systems.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Details of literature search and data sets.</p>
        <media xlink:href="jmir_v26i1e46936_app1.pdf" xlink:title="PDF File  (Adobe PDF File), 264 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Details of Fair Adversarial Gradient Tree Boosting.</p>
        <media xlink:href="jmir_v26i1e46936_app2.pdf" xlink:title="PDF File  (Adobe PDF File), 92 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">CAD</term>
          <def>
            <p>coronary artery disease</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">DispFNR</term>
          <def>
            <p>disparity in false negative rate</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">EF</term>
          <def>
            <p>ejection fraction</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">FAGTB</term>
          <def>
            <p>Fair Adversarial Gradient Tree Boosting</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">FNR</term>
          <def>
            <p>false negative rate</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">FPR</term>
          <def>
            <p>false positive rate</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">HF</term>
          <def>
            <p>heart failure</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">LGBTQ+</term>
          <def>
            <p>lesbian, gay, bisexual, transgender, queer</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">ML</term>
          <def>
            <p>machine learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">PRISMA</term>
          <def>
            <p>Preferred Reporting Items for Systematic Reviews and Meta-Analyses</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">RF</term>
          <def>
            <p>random forest</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The data sets analyzed during this study are publicly available. Data set 1 is available from the University of California Irvine Machine Learning Repository [<xref ref-type="bibr" rid="ref21">21</xref>]. Data set 2 is available from the IEEE Dataport Repository [<xref ref-type="bibr" rid="ref22">22</xref>]. This work was supported by UK Research and Innovation (UKRI; EP/S021612/1).</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>O'Neil</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <source>Weapons of math destruction: how big data increases inequality and threatens democracy</source>
          <year>2017</year>
          <publisher-loc>New York City, U.S</publisher-loc>
          <publisher-name>Crown</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Daneshjou</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Vodrahalli</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Novoa</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Jenkins</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Rotemberg</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Ko</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Swetter</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Bailey</surname>
              <given-names>EE</given-names>
            </name>
            <name name-style="western">
              <surname>Gevaert</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Mukherjee</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Phung</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yekrang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Fong</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Sahasrabudhe</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Allerup</surname>
              <given-names>JAC</given-names>
            </name>
            <name name-style="western">
              <surname>Okata-Karigane</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chiou</surname>
              <given-names>AS</given-names>
            </name>
          </person-group>
          <article-title>Disparities in dermatology AI performance on a diverse, curated clinical image set</article-title>
          <source>Sci Adv</source>
          <year>2022</year>
          <volume>8</volume>
          <issue>32</issue>
          <fpage>eabq6147</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.science.org/doi/10.1126/sciadv.abq6147"/>
          </comment>
          <pub-id pub-id-type="doi">10.1126/sciadv.abq6147</pub-id>
          <pub-id pub-id-type="medline">35960806</pub-id>
          <pub-id pub-id-type="pmcid">PMC9374341</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Seyyed-Kalantari</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>McDermott</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>IY</given-names>
            </name>
            <name name-style="western">
              <surname>Ghassemi</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>CheXclusion: fairness gaps in deep chest X-ray classifiers</article-title>
          <source>Biocomputing</source>
          <year>2021</year>
          <fpage>232</fpage>
          <lpage>243</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://psb.stanford.edu/psb-online/proceedings/psb21/abstracts/2021_p232.html"/>
          </comment>
          <pub-id pub-id-type="doi">10.1142/9789811232701_0022</pub-id>
          <pub-id pub-id-type="medline">33691020</pub-id>
          <pub-id pub-id-type="pii">9789811232701_0022</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thompson</surname>
              <given-names>HM</given-names>
            </name>
            <name name-style="western">
              <surname>Sharma</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Bhalla</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Boley</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>McCluskey</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Dligach</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Churpek</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Karnik</surname>
              <given-names>NS</given-names>
            </name>
            <name name-style="western">
              <surname>Afshar</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Bias and fairness assessment of a natural language processing opioid misuse classifier: detection and mitigation of electronic health record data disadvantages across racial subgroups</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2021</year>
          <volume>28</volume>
          <issue>11</issue>
          <fpage>2393</fpage>
          <lpage>2403</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/jamia/article/28/11/2393/6349190?login=false"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocab148</pub-id>
          <pub-id pub-id-type="medline">34383925</pub-id>
          <pub-id pub-id-type="pii">6349190</pub-id>
          <pub-id pub-id-type="pmcid">PMC8510285</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Obermeyer</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Powers</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Vogeli</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Mullainathan</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Dissecting racial bias in an algorithm used to manage the health of populations</article-title>
          <source>Science</source>
          <year>2019</year>
          <volume>366</volume>
          <issue>6464</issue>
          <fpage>447</fpage>
          <lpage>453</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.science.org/doi/10.1126/science.aax2342"/>
          </comment>
          <pub-id pub-id-type="doi">10.1126/science.aax2342</pub-id>
          <pub-id pub-id-type="medline">31649194</pub-id>
          <pub-id pub-id-type="pii">366/6464/447</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cirillo</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Catuara-Solarz</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Morey</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Guney</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Subirats</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Mellino</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gigante</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Valencia</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Rementeria</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Chadha</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Mavridis</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Sex and gender differences and biases in artificial intelligence for biomedicine and healthcare</article-title>
          <source>NPJ Digit Med</source>
          <year>2020</year>
          <volume>3</volume>
          <fpage>81</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.nature.com/articles/s41746-020-0288-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-020-0288-5</pub-id>
          <pub-id pub-id-type="medline">32529043</pub-id>
          <pub-id pub-id-type="pii">288</pub-id>
          <pub-id pub-id-type="pmcid">PMC7264169</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Yeung</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Ho</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Dumontier</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Thoral</surname>
              <given-names>PJ</given-names>
            </name>
            <name name-style="western">
              <surname>Mao</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>RG</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
          </person-group>
          <article-title>Illness severity assessment of older adults in critical illness using machine learning (ELDER-ICU): an international multicentre study with subgroup bias evaluation</article-title>
          <source>Lancet Digit Health</source>
          <year>2023</year>
          <volume>5</volume>
          <issue>10</issue>
          <fpage>e657</fpage>
          <lpage>e667</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.thelancet.com/journals/landig/article/PIIS2589-7500(23)00128-0/fulltext"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/S2589-7500(23)00128-0</pub-id>
          <pub-id pub-id-type="medline">37599147</pub-id>
          <pub-id pub-id-type="pii">S2589-7500(23)00128-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Grari</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Ruf</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Lamprier</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Detyniecki</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Fair adversarial gradient tree boosting</article-title>
          <year>2019</year>
          <conf-name>2019 IEEE International Conference on Data Mining (ICDM)</conf-name>
          <conf-date>November 8-11, 2019</conf-date>
          <conf-loc>Beijing, China</conf-loc>
          <fpage>1060</fpage>
          <lpage>1065</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ieeexplore.ieee.org/document/8970941"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/icdm.2019.00124</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Savarese</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Becher</surname>
              <given-names>PM</given-names>
            </name>
            <name name-style="western">
              <surname>Lund</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Seferovic</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Rosano</surname>
              <given-names>GMC</given-names>
            </name>
            <name name-style="western">
              <surname>Coats</surname>
              <given-names>AJS</given-names>
            </name>
          </person-group>
          <article-title>Global burden of heart failure: a comprehensive and updated review of epidemiology</article-title>
          <source>Cardiovasc Res</source>
          <year>2022</year>
          <volume>118</volume>
          <issue>17</issue>
          <fpage>3272</fpage>
          <lpage>3287</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/cardiovascres/article/118/17/3272/6527627?login=false"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/cvr/cvac013</pub-id>
          <pub-id pub-id-type="medline">35150240</pub-id>
          <pub-id pub-id-type="pii">6527627</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goldraich</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Beck-da-Silva</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Clausell</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Are scores useful in advanced heart failure?</article-title>
          <source>Expert Rev Cardiovasc Ther</source>
          <year>2009</year>
          <volume>7</volume>
          <issue>8</issue>
          <fpage>985</fpage>
          <lpage>997</lpage>
          <pub-id pub-id-type="doi">10.1586/erc.09.68</pub-id>
          <pub-id pub-id-type="medline">19673676</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Treece</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chemchirian</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Hamilton</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Jbara</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gangadharan</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Paul</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Baumrucker</surname>
              <given-names>SJ</given-names>
            </name>
          </person-group>
          <article-title>A review of prognostic tools in heart failure</article-title>
          <source>Am J Hosp Palliat Med</source>
          <year>2018</year>
          <volume>35</volume>
          <issue>3</issue>
          <fpage>514</fpage>
          <lpage>522</lpage>
          <pub-id pub-id-type="doi">10.1177/1049909117709468</pub-id>
          <pub-id pub-id-type="medline">28554221</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thorvaldsen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Benson</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Ståhlberg</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Dahlström</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Edner</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lund</surname>
              <given-names>LH</given-names>
            </name>
          </person-group>
          <article-title>Triage of patients with moderate to severe heart failure: who should be referred to a heart failure center?</article-title>
          <source>J Am Coll Cardiol</source>
          <year>2014</year>
          <volume>63</volume>
          <issue>7</issue>
          <fpage>661</fpage>
          <lpage>671</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.sciencedirect.com/science/article/pii/S0735109713057537?via%3Dihub"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jacc.2013.10.017</pub-id>
          <pub-id pub-id-type="medline">24161453</pub-id>
          <pub-id pub-id-type="pii">S0735-1097(13)05753-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Escamilla</surname>
              <given-names>AKG</given-names>
            </name>
            <name name-style="western">
              <surname>Hassani</surname>
              <given-names>AHE</given-names>
            </name>
            <name name-style="western">
              <surname>Andres</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>A comparison of machine learning techniques to predict the risk of heart failure</article-title>
          <source>Machine Learning Paradigms: Applications of Learning and Analytics in Intelligent Systems</source>
          <year>2019</year>
          <publisher-loc>Cham, Switzerland</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>9</fpage>
          <lpage>26</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sullivan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Doumouras</surname>
              <given-names>BS</given-names>
            </name>
            <name name-style="western">
              <surname>Santema</surname>
              <given-names>BT</given-names>
            </name>
            <name name-style="western">
              <surname>Walsh</surname>
              <given-names>MN</given-names>
            </name>
            <name name-style="western">
              <surname>Douglas</surname>
              <given-names>PS</given-names>
            </name>
            <name name-style="western">
              <surname>Voors</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>Van Spall</surname>
              <given-names>HGC</given-names>
            </name>
          </person-group>
          <article-title>Sex-specific differences in heart failure: pathophysiology, risk factors, management, and outcomes</article-title>
          <source>Can J Cardiol</source>
          <year>2021</year>
          <volume>37</volume>
          <issue>4</issue>
          <fpage>560</fpage>
          <lpage>571</lpage>
          <pub-id pub-id-type="doi">10.1016/j.cjca.2020.12.025</pub-id>
          <pub-id pub-id-type="medline">33383166</pub-id>
          <pub-id pub-id-type="pii">S0828-282X(20)31196-X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Walsh</surname>
              <given-names>MN</given-names>
            </name>
            <name name-style="western">
              <surname>Jessup</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lindenfeld</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Women with heart failure: unheard, untreated, and unstudied</article-title>
          <source>J Am Coll Cardiol</source>
          <year>2019</year>
          <volume>73</volume>
          <issue>1</issue>
          <fpage>41</fpage>
          <lpage>43</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.sciencedirect.com/science/article/pii/S0735109718389800?via%3Dihub"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jacc.2018.10.041</pub-id>
          <pub-id pub-id-type="medline">30621949</pub-id>
          <pub-id pub-id-type="pii">S0735-1097(18)38980-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sobhani</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Castro</surname>
              <given-names>DKN</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Gottlieb</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Van Eyk</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Merz</surname>
              <given-names>CNB</given-names>
            </name>
          </person-group>
          <article-title>Sex differences in ischemic heart disease and heart failure biomarkers</article-title>
          <source>Biol Sex Differ</source>
          <year>2018</year>
          <volume>9</volume>
          <issue>1</issue>
          <fpage>43</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bsd.biomedcentral.com/articles/10.1186/s13293-018-0201-y"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13293-018-0201-y</pub-id>
          <pub-id pub-id-type="medline">30223899</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13293-018-0201-y</pub-id>
          <pub-id pub-id-type="pmcid">PMC6142320</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Straw</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>The automation of bias in medical Artificial Intelligence (AI): decoding the past to create a better future</article-title>
          <source>Artif Intell Med</source>
          <year>2020</year>
          <volume>110</volume>
          <fpage>101965</fpage>
          <pub-id pub-id-type="doi">10.1016/j.artmed.2020.101965</pub-id>
          <pub-id pub-id-type="medline">33250145</pub-id>
          <pub-id pub-id-type="pii">S0933-3657(20)31230-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hamberg</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Gender bias in medicine</article-title>
          <source>Womens Health (Lond)</source>
          <year>2008</year>
          <volume>4</volume>
          <issue>3</issue>
          <fpage>237</fpage>
          <lpage>243</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/10.2217/17455057.4.3.237"/>
          </comment>
          <pub-id pub-id-type="doi">10.2217/17455057.4.3.237</pub-id>
          <pub-id pub-id-type="medline">19072473</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Krieger</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Fee</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Man-made medicine and women's health: the biopolitics of sex/gender and race/ethnicity</article-title>
          <source>Int J Health Serv</source>
          <year>1994</year>
          <volume>24</volume>
          <issue>2</issue>
          <fpage>265</fpage>
          <lpage>283</lpage>
          <pub-id pub-id-type="doi">10.2190/LWLH-NMCJ-UACL-U80Y</pub-id>
          <pub-id pub-id-type="medline">8034393</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="web">
          <article-title>PRISMA flow diagram</article-title>
          <source>PRISMA</source>
          <access-date>2024-07-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.prisma-statement.org/prisma-2020-flow-diagram">https://www.prisma-statement.org/prisma-2020-flow-diagram</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tanvir</surname>
              <given-names>AAM</given-names>
            </name>
            <name name-style="western">
              <surname>Bhatti</surname>
              <given-names>SH</given-names>
            </name>
            <name name-style="western">
              <surname>Aftab</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Raza</surname>
              <given-names>MA</given-names>
            </name>
          </person-group>
          <article-title>Heart failure clinical records data set</article-title>
          <source>University of California Irvine Machine Learning Repository</source>
          <year>2020</year>
          <access-date>2024-05-17</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://archive.ics.uci.edu/dataset/519/heart+failure+clinical+records">https://archive.ics.uci.edu/dataset/519/heart+failure+clinical+records</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Siddhartha</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Heart disease dataset (comprehensive)</article-title>
          <source>IEEE Dataport</source>
          <year>2020</year>
          <access-date>2024-05-17</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ieee-dataport.org/open-access/heart-disease-dataset-comprehensive">https://ieee-dataport.org/open-access/heart-disease-dataset-comprehensive</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chicco</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Jurman</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Machine learning can predict survival of patients with heart failure from serum creatinine and ejection fraction alone</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2020</year>
          <volume>20</volume>
          <issue>1</issue>
          <fpage>16</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-020-1023-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-020-1023-5</pub-id>
          <pub-id pub-id-type="medline">32013925</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-020-1023-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC6998201</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Buolamwini</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Gebru</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Gender shades: intersectional accuracy disparities in commercial gender classification</article-title>
          <year>2018</year>
          <conf-name>1st Conference on Fairness, Accountability and Transparency, PMLR 81</conf-name>
          <conf-date>February 23-24, 2018</conf-date>
          <conf-loc>New York, NY</conf-loc>
          <fpage>77</fpage>
          <lpage>91</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://proceedings.mlr.press/v81/buolamwini18a.html"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mishra</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Pandey</surname>
              <given-names>CM</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sahu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Keshri</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Descriptive statistics and normality tests for statistical data</article-title>
          <source>Ann Card Anaesth</source>
          <year>2019</year>
          <volume>22</volume>
          <issue>1</issue>
          <fpage>67</fpage>
          <lpage>72</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.lww.com/aoca/fulltext/2019/22010/descriptive_statistics_and_normality_tests_for.11.aspx"/>
          </comment>
          <pub-id pub-id-type="doi">10.4103/aca.ACA_157_18</pub-id>
          <pub-id pub-id-type="medline">30648682</pub-id>
          <pub-id pub-id-type="pii">AnnCardAnaesth_2019_22_1_67_250184</pub-id>
          <pub-id pub-id-type="pmcid">PMC6350423</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Afrose</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Nemeroff</surname>
              <given-names>CB</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Yao</surname>
              <given-names>DD</given-names>
            </name>
          </person-group>
          <article-title>Subpopulation-specific machine learning prognosis for underrepresented patients with double prioritized bias correction</article-title>
          <source>Commun Med (Lond)</source>
          <year>2022</year>
          <volume>2</volume>
          <fpage>111</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.nature.com/articles/s43856-022-00165-w"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s43856-022-00165-w</pub-id>
          <pub-id pub-id-type="medline">36059892</pub-id>
          <pub-id pub-id-type="pii">165</pub-id>
          <pub-id pub-id-type="pmcid">PMC9436942</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chawla</surname>
              <given-names>NV</given-names>
            </name>
            <name name-style="western">
              <surname>Bowyer</surname>
              <given-names>KW</given-names>
            </name>
            <name name-style="western">
              <surname>Hall</surname>
              <given-names>LO</given-names>
            </name>
            <name name-style="western">
              <surname>Kegelmeyer</surname>
              <given-names>WP</given-names>
            </name>
          </person-group>
          <article-title>SMOTE: synthetic minority over-sampling technique</article-title>
          <source>J Artif Intell Res</source>
          <year>2002</year>
          <volume>16</volume>
          <issue>3</issue>
          <fpage>321</fpage>
          <lpage>357</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1613/jair.953"/>
          </comment>
          <pub-id pub-id-type="doi">10.1613/jair.953</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Islam</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>Eberle</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Ghafoor</surname>
              <given-names>SK</given-names>
            </name>
            <name name-style="western">
              <surname>Ahmed</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Explainable artificial intelligence approaches: a survey</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on January 23, 2021</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2101.09429"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lundberg</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>SI</given-names>
            </name>
          </person-group>
          <article-title>A unified approach to interpreting model predictions</article-title>
          <year>2017</year>
          <conf-name>NIPS'17: 31st International Conference on Neural Information Processing Systems</conf-name>
          <conf-date>December 4-9, 2017</conf-date>
          <conf-loc>Long Beach, CA</conf-loc>
          <fpage>4768</fpage>
          <lpage>4777</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/proceedings/10.5555/3295222"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Borgese</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Joyce</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Anderson</surname>
              <given-names>EE</given-names>
            </name>
            <name name-style="western">
              <surname>Churpek</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Afshar</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Bias assessment and correction in machine learning algorithms: a use-case in a natural language processing algorithm to identify hospitalized patients with unhealthy alcohol use</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2022</year>
          <volume>2021</volume>
          <fpage>247</fpage>
          <lpage>254</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/35308909"/>
          </comment>
          <pub-id pub-id-type="medline">35308909</pub-id>
          <pub-id pub-id-type="pii">3575666</pub-id>
          <pub-id pub-id-type="pmcid">PMC8861719</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Allen</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mataraso</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Siefkas</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Burdick</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Braden</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Dellinger</surname>
              <given-names>RP</given-names>
            </name>
            <name name-style="western">
              <surname>McCoy</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Pellegrini</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Hoffman</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Green-Saxena</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Barnes</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Calvert</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Das</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>A racially unbiased, machine learning approach to prediction of mortality: algorithm development study</article-title>
          <source>JMIR Public Health Surveill</source>
          <year>2020</year>
          <volume>6</volume>
          <issue>4</issue>
          <fpage>e22400</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://publichealth.jmir.org/2020/4/e22400/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/22400</pub-id>
          <pub-id pub-id-type="medline">33090117</pub-id>
          <pub-id pub-id-type="pii">v6i4e22400</pub-id>
          <pub-id pub-id-type="pmcid">PMC7644374</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tison</surname>
              <given-names>GH</given-names>
            </name>
            <name name-style="western">
              <surname>Avram</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Nah</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Klein</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Howard</surname>
              <given-names>BV</given-names>
            </name>
            <name name-style="western">
              <surname>Allison</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Casanova</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Blair</surname>
              <given-names>RH</given-names>
            </name>
            <name name-style="western">
              <surname>Breathett</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Foraker</surname>
              <given-names>RE</given-names>
            </name>
            <name name-style="western">
              <surname>Olgin</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Parikh</surname>
              <given-names>NI</given-names>
            </name>
          </person-group>
          <article-title>Predicting incident heart failure in women with machine learning: the women's health initiative cohort</article-title>
          <source>Can J Cardiol</source>
          <year>2021</year>
          <volume>37</volume>
          <issue>11</issue>
          <fpage>1708</fpage>
          <lpage>1714</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/34400272"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.cjca.2021.08.006</pub-id>
          <pub-id pub-id-type="medline">34400272</pub-id>
          <pub-id pub-id-type="pii">S0828-282X(21)00651-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC8642266</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pombo</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Gray</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Cardoso</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ourselin</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rees</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Ashburner</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Nachev</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Equitable modelling of brain imaging by counterfactual augmentation with morphologically constrained 3D deep generative models</article-title>
          <source>Med Image Anal</source>
          <year>2023</year>
          <volume>84</volume>
          <fpage>102723</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.sciencedirect.com/science/article/pii/S1361841522003516?via%3Dihub"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.media.2022.102723</pub-id>
          <pub-id pub-id-type="medline">36542907</pub-id>
          <pub-id pub-id-type="pii">S1361-8415(22)00351-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC10591114</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McCradden</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Joshi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mazwi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Anderson</surname>
              <given-names>JA</given-names>
            </name>
          </person-group>
          <article-title>Ethical limitations of algorithmic fairness solutions in health care machine learning</article-title>
          <source>Lancet Digit Health</source>
          <year>2020</year>
          <volume>2</volume>
          <issue>5</issue>
          <fpage>e221</fpage>
          <lpage>e223</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.thelancet.com/journals/landig/article/PIIS2589-7500(20)30065-0/fulltext"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/S2589-7500(20)30065-0</pub-id>
          <pub-id pub-id-type="medline">33328054</pub-id>
          <pub-id pub-id-type="pii">S2589-7500(20)30065-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Safer</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Coleman</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Feldman</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Garofalo</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Hembree</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Radix</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sevelius</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Barriers to healthcare for transgender individuals</article-title>
          <source>Curr Opin Endocrinol Diabetes Obes</source>
          <year>2016</year>
          <volume>23</volume>
          <issue>2</issue>
          <fpage>168</fpage>
          <lpage>171</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/26910276"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/MED.0000000000000227</pub-id>
          <pub-id pub-id-type="medline">26910276</pub-id>
          <pub-id pub-id-type="pii">01266029-201604000-00014</pub-id>
          <pub-id pub-id-type="pmcid">PMC4802845</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rutherford</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Stark</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ablona</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Klassen</surname>
              <given-names>BJ</given-names>
            </name>
            <name name-style="western">
              <surname>Higgins</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Jacobsen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Draenos</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Card</surname>
              <given-names>KG</given-names>
            </name>
            <name name-style="western">
              <surname>Lachowsky</surname>
              <given-names>NJ</given-names>
            </name>
          </person-group>
          <article-title>Health and well-being of trans and non-binary participants in a community-based survey of gay, bisexual, and queer men, and non-binary and two-spirit people across Canada</article-title>
          <source>PLoS One</source>
          <year>2021</year>
          <volume>16</volume>
          <issue>2</issue>
          <fpage>e0246525</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0246525"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0246525</pub-id>
          <pub-id pub-id-type="medline">33571252</pub-id>
          <pub-id pub-id-type="pii">PONE-D-20-30194</pub-id>
          <pub-id pub-id-type="pmcid">PMC7877578</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Beckwith</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>McDowell</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Reisner</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Zaslow</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Weiss</surname>
              <given-names>RD</given-names>
            </name>
            <name name-style="western">
              <surname>Mayer</surname>
              <given-names>KH</given-names>
            </name>
            <name name-style="western">
              <surname>Keuroghlian</surname>
              <given-names>AS</given-names>
            </name>
          </person-group>
          <article-title>Psychiatric epidemiology of transgender and nonbinary adult patients at an urban health center</article-title>
          <source>LGBT Health</source>
          <year>2019</year>
          <volume>6</volume>
          <issue>2</issue>
          <fpage>51</fpage>
          <lpage>61</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/30707624"/>
          </comment>
          <pub-id pub-id-type="doi">10.1089/lgbt.2018.0136</pub-id>
          <pub-id pub-id-type="medline">30707624</pub-id>
          <pub-id pub-id-type="pmcid">PMC6434596</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vejjajiva</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Teasdale</surname>
              <given-names>GM</given-names>
            </name>
          </person-group>
          <article-title>Serum creatine kinase and physical exercise</article-title>
          <source>Br Med J</source>
          <year>1965</year>
          <volume>1</volume>
          <issue>5451</issue>
          <fpage>1653</fpage>
          <lpage>1654</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.bmj.com/content/1/5451/1653"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmj.1.5451.1653</pub-id>
          <pub-id pub-id-type="medline">14295325</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Carruthers</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Straw</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Ruffle</surname>
              <given-names>JK</given-names>
            </name>
            <name name-style="western">
              <surname>Herron</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Nelson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bzdok</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Fernandez-Reyes</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Rees</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Nachev</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Representational ethical model calibration</article-title>
          <source>NPJ Digit Med</source>
          <year>2022</year>
          <volume>5</volume>
          <issue>1</issue>
          <fpage>170</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.nature.com/articles/s41746-022-00716-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-022-00716-4</pub-id>
          <pub-id pub-id-type="medline">36333390</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-022-00716-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC9636204</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
