<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v28i1e95452</article-id>
      <article-id pub-id-type="pmid">42297359</article-id>
      <article-id pub-id-type="doi">10.2196/95452</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Viewpoint</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Viewpoint</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Training AI Models for Aesthetic Facial Evaluation: Focused Review and Framework to Mitigate Homogenizing Bias</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Coristine</surname>
            <given-names>Andrew</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Hu</surname>
            <given-names>Yihan</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Patil</surname>
            <given-names>Vikas</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Ilodigwe</surname>
            <given-names>Lucky</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Kumar</surname>
            <given-names>Anisha R</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Division of Otolaryngology, Department of Surgery</institution>
            <institution>Stony Brook University</institution>
            <addr-line>101 Nicolls Road</addr-line>
            <addr-line>Stony Brook, NY, 11790</addr-line>
            <country>United States</country>
            <phone>1 (631) 444 7875</phone>
            <email>arkumar@post.harvard.edu</email>
          </address>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7067-8891</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Varshney</surname>
            <given-names>Lav R</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2798-5308</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Division of Otolaryngology, Department of Surgery</institution>
        <institution>Stony Brook University</institution>
        <addr-line>Stony Brook, NY</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>AI Innovation Institute</institution>
        <institution>Stony Brook University</institution>
        <addr-line>Stony Brook, NY</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Anisha R Kumar <email>arkumar@post.harvard.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2026</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>15</day>
        <month>6</month>
        <year>2026</year>
      </pub-date>
      <volume>28</volume>
      <elocation-id>e95452</elocation-id>
      <history>
        <date date-type="received">
          <day>16</day>
          <month>3</month>
          <year>2026</year>
        </date>
        <date date-type="rev-request">
          <day>16</day>
          <month>4</month>
          <year>2026</year>
        </date>
        <date date-type="accepted">
          <day>31</day>
          <month>5</month>
          <year>2026</year>
        </date>
      </history>
      <copyright-statement>©Anisha R Kumar, Lav R Varshney. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 15.06.2026.</copyright-statement>
      <copyright-year>2026</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2026/1/e95452" xlink:type="simple"/>
      <abstract>
        <p>As artificial intelligence (AI) models become increasingly integrated into facial aesthetic surgery for attractiveness prediction and surgical outcome simulation, their potential to perpetuate bias poses clinical concerns. Current models trained on limited datasets inaccurately evaluate underrepresented populations and risk promoting aesthetic homogenization that conflicts with patient goals of ethnic feature preservation. Drawing on current literature, this paper examines bias across AI development stages in aesthetic facial evaluation. Benchmark datasets such as SCUT-FBP (South China University of Technology—Facial Beauty Prediction) and the Chicago Face Database underrepresent older adults as well as non-White and ethnically diverse populations. Training methodologies lack fairness-aware techniques, and evaluation focuses on overall rather than demographic-stratified accuracy. While individual mitigation strategies exist—including balanced datasets, adversarial debiasing, and fairness metrics—no comprehensive framework integrates these approaches across the entire development lifecycle. We propose a 6-pillar framework spanning the AI development lifecycle: (1) diverse data collection with synthetic augmentation, (2) fairness-aware training techniques, (3) complementary fairness metrics with intersectional assessment, (4) explainable AI for clinical transparency, (5) stakeholder engagement, and (6) continuous monitoring. Despite the challenges of maintaining algorithmic standardization and cultural specificity, this framework provides implementation guidance for AI developers, clinicians, and institutions, with principles applicable beyond aesthetic surgery to broader facial analysis applications.</p>
      </abstract>
      <kwd-group>
        <kwd>aesthetics</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>fairness</kwd>
        <kwd>framework</kwd>
        <kwd>governance</kwd>
        <kwd>machine learning</kwd>
        <kwd>medical informatics</kwd>
        <kwd>model development</kwd>
        <kwd>plastic surgery</kwd>
        <kwd>surgery</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Artificial intelligence (AI) is increasingly integrated into preoperative planning and outcome simulation in facial aesthetic surgery [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>], typically based on modern machine learning (ML) techniques. Current applications include prediction of attractiveness, simulation of surgical outcomes, and patient assessments [<xref ref-type="bibr" rid="ref3">3</xref>], offering the potential for objective, standardized aesthetic evaluation [<xref ref-type="bibr" rid="ref4">4</xref>]. Facial plastic surgeons have preliminarily assessed how established AI-based websites compare to human scoring of facial attractiveness [<xref ref-type="bibr" rid="ref5">5</xref>], yet standardized and validated AI models for facial aesthetic evaluation have not been established. Before widespread clinical integration, surgeons must understand how these models are trained to ensure accurate, culturally diverse evaluations and avoid perpetuating bias.</p>
      <p>The technical challenge of AI model training is compounded by the fact that beauty is a cultural construct rather than a universal fact [<xref ref-type="bibr" rid="ref6">6</xref>]. Recent large-scale cross-cultural research using geometric morphometrics and Bayesian analysis of 1550 faces from 10 global populations reveals this complexity: distinctiveness (deviation from average facial proportions) negatively affects attractiveness perception universally, and femininity positively influences attractiveness assessments of female faces across all studied populations [<xref ref-type="bibr" rid="ref7">7</xref>]. However, 2 traditionally emphasized features showed no robust effects: facial symmetry had no significant association with attractiveness ratings, and masculinity did not consistently influence attractiveness judgments of male faces [<xref ref-type="bibr" rid="ref7">7</xref>]. These findings challenge conventional assumptions about which features are genuinely universal versus culturally variable.</p>
      <p>These universal principles interact with culture-specific preferences in complex ways. Skin coloration demonstrates culturally modulated aesthetic judgments: lighter skin tones associate with attractiveness among Chinese observers judging own-ethnicity faces, whereas European observers associate warmer yellow skin tones with attractiveness in Chinese faces [<xref ref-type="bibr" rid="ref8">8</xref>]. Cross-cultural studies comparing Japanese and American raters reveal that while overall attractiveness ratings correlate across cultures, specific features driving these judgments differ: Japanese raters emphasize raised eyebrows in attractive male faces and smaller mouths in attractive female faces more than American raters [<xref ref-type="bibr" rid="ref9">9</xref>]. Similarly, considerable cross-cultural agreement exists regarding Vietnamese facial attractiveness, yet Czech European raters associate attractiveness with averageness significantly more than Vietnamese raters [<xref ref-type="bibr" rid="ref10">10</xref>]. These findings illustrate that while certain structural features (averageness and femininity) demonstrate universal appeal, the relative importance and specific manifestations of these features vary across cultural contexts.</p>
      <p>Historically, plastic surgery has relied on Western aesthetic standards [<xref ref-type="bibr" rid="ref11">11</xref>], but there is growing patient diversity and a shift toward preservation of features that convey ethnic identity [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. An inherent tension, therefore, exists in training AI models: they must account for genuinely universal principles (such as averageness and femininity in female faces) while also recognizing culture-specific preferences and avoiding the imposition of Eurocentric standards on features that demonstrate regional variation [<xref ref-type="bibr" rid="ref4">4</xref>].</p>
      <p>Furthermore, AI models trained on biased datasets may perpetuate narrow beauty ideals and inaccurate representations of patient populations who vary across race, ethnicity, nationality, language, socioeconomic background, gender, and age [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. These dimensions interact to produce compounded disadvantage at their intersections, creating a risk that algorithmic recommendations lead to the elimination of distinctive ethnic characteristics and to aesthetic homogenization. Culturally responsive AI training frameworks are therefore needed [<xref ref-type="bibr" rid="ref4">4</xref>]; without them, a gap will persist between the technical capabilities of AI and its ethical implementation in comprehensive patient care [<xref ref-type="bibr" rid="ref16">16</xref>].</p>
      <p>In this paper, we aim to: (1) review AI training methodologies for aesthetic evaluation, (2) examine sources of bias in training, (3) evaluate current practices for mitigating bias, and (4) propose a framework for artificial intelligence/machine learning (AI/ML) training with recommendations for clinical implementation.</p>
    </sec>
    <sec>
      <title>Review of Current Training Methods</title>
      <p>Deep learning is a method within AI/ML that uses multilayer neural networks to process large amounts of data and extract complex patterns and features. Within this approach, convolutional neural networks (CNNs) are common for image recognition and processing [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. Such ML models are trained using supervised learning based on human-rated attractiveness scores [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>], geometric features including symmetry and the golden ratio [<xref ref-type="bibr" rid="ref20">20</xref>], the “rule of thirds” for frontal view analysis [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref20">20</xref>], and extracted facial proportions such as nasofrontal angle, nasolabial angle, and glabella-to-chin angle [<xref ref-type="bibr" rid="ref17">17</xref>].</p>
      <p>Model performance is evaluated using regression metrics, including the Pearson correlation coefficient, which measures the linear relationship between AI-predicted attractiveness scores and human ratings [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>], and mean absolute error, which measures the average magnitude of prediction errors. These metrics validate how well the AI model replicates human aesthetic judgment. However, recent work in health-related AI has emphasized that evaluation should not rely on task performance metrics alone but also incorporate risk-oriented and context-sensitive assessment frameworks [<xref ref-type="bibr" rid="ref23">23</xref>].</p>
      <p>Beyond attractiveness prediction alone, AI models can simultaneously learn to perform related tasks, including age evaluation, gender identification, ethnicity and race recognition, and facial expression detection [<xref ref-type="bibr" rid="ref24">24</xref>]. This multitask learning (MTL) approach improves the model’s evaluative capacity across all tasks and develops a more comprehensive understanding of facial features that determine attractiveness. MTL also addresses dataset size limitations (SCUT-FBP [South China University of Technology—Facial Beauty Prediction]: 500 faces; SCUT-FBP5500: 5500 faces) by incorporating auxiliary tasks that provide additional training signals, leveraging natural connections between attractiveness and related attributes [<xref ref-type="bibr" rid="ref24">24</xref>].</p>
      <p>However, recent architectural innovations have moved beyond traditional CNNs to address dataset limitations. Hybrid approaches combining vision transformers and state-space models (such as Mamba) leverage complementary feature extraction capabilities [<xref ref-type="bibr" rid="ref25">25</xref>]. Vision transformers excel at capturing global facial structure and symmetry through attention mechanisms that process the entire face holistically, while state-space models efficiently model fine-grained local features such as skin quality and texture with linear computational complexity [<xref ref-type="bibr" rid="ref25">25</xref>].</p>
      <p>These architectures use self-supervised pretraining on large, diverse image datasets, followed by task-specific fine-tuning on smaller facial beauty datasets [<xref ref-type="bibr" rid="ref25">25</xref>]. This transfer learning approach addresses the fundamental challenge that aesthetic datasets, such as SCUT-FBP5500 with 5500 faces, contain orders of magnitude fewer images than typically required for deep learning, while achieving state-of-the-art performance metrics [<xref ref-type="bibr" rid="ref25">25</xref>].</p>
      <p>Several benchmark datasets are currently used to train AI models for attractiveness evaluation. The SCUT-FBP database contains 500 Asian female faces and achieved a Pearson correlation coefficient of 0.8187 between CNN predictions and human ratings [<xref ref-type="bibr" rid="ref21">21</xref>]. The expanded SCUT-FBP5500 dataset includes 5500 faces with more diverse demographics and achieved correlations of 0.87-0.90 with human evaluators [<xref ref-type="bibr" rid="ref26">26</xref>]. The Chicago Face Database contains 597 photographs of White, Black, Asian, and Latino male and female individuals aged 17-65 years [<xref ref-type="bibr" rid="ref27">27</xref>]. Characteristics of each major dataset are detailed in <xref ref-type="table" rid="table1">Table 1</xref> [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref32">32</xref>].</p>
      <table-wrap position="float" id="table1">
        <label>Table 1</label>
        <caption>
          <p>Comparison of major facial aesthetic datasets. Characteristics of 8 datasets commonly used to train and evaluate artificial intelligence (AI) models for facial attractiveness prediction, with the publication year given in parentheses after each dataset name. Datasets were identified from the systematic literature review in this paper. Sample sizes and demographics reflect the most current information reported for each dataset.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="110"/>
          <col width="140"/>
          <col width="180"/>
          <col width="140"/>
          <col width="130"/>
          <col width="160"/>
          <col width="140"/>
          <thead>
            <tr valign="top">
              <td>Database</td>
              <td>Sample size (N); age range (years)</td>
              <td>Demographics (race, ethnicity, and gender)</td>
              <td>Standardized images</td>
              <td>Primary use</td>
              <td>Key limitations</td>
              <td>Validation metrics</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>SCUT-FBP (2015) [<xref ref-type="bibr" rid="ref21">21</xref>]</td>
              <td>N=500; not reported</td>
              <td>
                <list list-type="bullet">
                  <list-item>
                    <p>100% Asian</p>
                  </list-item>
                  <list-item>
                    <p>100% Female</p>
                  </list-item>
                </list>
              </td>
              <td>Yes (controlled laboratory conditions)</td>
              <td>Facial attractiveness prediction benchmark</td>
              <td>
                <list list-type="bullet">
                  <list-item>
                    <p>Small sample</p>
                  </list-item>
                  <list-item>
                    <p>Single demographic (Asian females)</p>
                  </list-item>
                  <list-item>
                    <p>Limited generalizability</p>
                  </list-item>
                </list>
              </td>
              <td>Pearson <italic>r</italic>=0.82</td>
            </tr>
            <tr valign="top">
              <td>SCUT-FBP5500 (2018) [<xref ref-type="bibr" rid="ref26">26</xref>]</td>
              <td>N=5500; 15-60</td>
              <td>
                <list list-type="bullet">
                  <list-item>
                    <p>73% Asian</p>
                  </list-item>
                  <list-item>
                    <p>27% Caucasian</p>
                  </list-item>
                  <list-item>
                    <p>50% Male</p>
                  </list-item>
                  <list-item>
                    <p>50% Female</p>
                  </list-item>
                </list>
              </td>
              <td>No (aggregated from multiple sources)</td>
              <td>Multiparadigm facial attractiveness prediction</td>
              <td>
                <list list-type="bullet">
                  <list-item>
                    <p>Limited ethnic diversity (Asian/Caucasian only)</p>
                  </list-item>
                  <list-item>
                    <p>Nonstandardized images</p>
                  </list-item>
                  <list-item>
                    <p>Variable quality</p>
                  </list-item>
                </list>
              </td>
              <td>Pearson <italic>r</italic>=0.87-0.90</td>
            </tr>
            <tr valign="top">
              <td>Chicago Face Database (2015) [<xref ref-type="bibr" rid="ref27">27</xref>]</td>
              <td>N=597; 17-65</td>
              <td>
                <list list-type="bullet">
                  <list-item>
                    <p>White</p>
                  </list-item>
                  <list-item>
                    <p>Black</p>
                  </list-item>
                  <list-item>
                    <p>Asian</p>
                  </list-item>
                  <list-item>
                    <p>Latino</p>
                  </list-item>
                  <list-item>
                    <p>Male</p>
                  </list-item>
                  <list-item>
                    <p>Female</p>
                  </list-item>
                  <list-item>
                    <p>Percentages unknown</p>
                  </list-item>
                </list>
              </td>
              <td>Yes (laboratory photography, standardized conditions)</td>
              <td>Research and AI model training across demographics</td>
              <td>
                <list list-type="bullet">
                  <list-item>
                    <p>Small sample</p>
                  </list-item>
                  <list-item>
                    <p>May not capture full diversity</p>
                  </list-item>
                </list>
              </td>
              <td>Attractiveness ratings by independent raters with interrater reliability; includes mean scores and SDs for multiple attributes</td>
            </tr>
            <tr valign="top">
              <td>BLINQ (dating site dataset) (not reported) [<xref ref-type="bibr" rid="ref30">30</xref>]</td>
              <td>N&gt;13,000; not reported</td>
              <td>
                <list list-type="bullet">
                  <list-item>
                    <p>Not specified (unknown demographics)</p>
                  </list-item>
                </list>
              </td>
              <td>No (collected from dating websites)</td>
              <td>CNN<sup>a</sup> for attractiveness training</td>
              <td>
                <list list-type="bullet">
                  <list-item>
                    <p>Nonstandardized</p>
                  </list-item>
                  <list-item>
                    <p>Unknown demographics</p>
                  </list-item>
                  <list-item>
                    <p>Selection bias</p>
                  </list-item>
                  <list-item>
                    <p>Rating bias</p>
                  </list-item>
                </list>
              </td>
              <td>Not reported</td>
            </tr>
            <tr valign="top">
              <td>Labeled Faces in the Wild (2007) [<xref ref-type="bibr" rid="ref28">28</xref>]</td>
              <td>N=5749; not reported</td>
              <td>
                <list list-type="bullet">
                  <list-item>
                    <p>77.5% Male</p>
                  </list-item>
                  <list-item>
                    <p>83.5% White</p>
                  </list-item>
                  <list-item>
                    <p>Remaining unknown</p>
                  </list-item>
                </list>
              </td>
              <td>No (captured from online articles/press)</td>
              <td>Facial recognition<sup>b</sup></td>
              <td>
                <list list-type="bullet">
                  <list-item>
                    <p>Severe demographic bias</p>
                  </list-item>
                  <list-item>
                    <p>Not designed for aesthetics</p>
                  </list-item>
                  <list-item>
                    <p>Nonstandardized</p>
                  </list-item>
                </list>
              </td>
              <td>Not applicable (facial recognition dataset)</td>
            </tr>
            <tr valign="top">
              <td>MEBeauty (2022) [<xref ref-type="bibr" rid="ref29">29</xref>]</td>
              <td>N=2550; not reported</td>
              <td>
                <list list-type="bullet">
                  <list-item>
                    <p>38% White</p>
                  </list-item>
                  <list-item>
                    <p>12% Black</p>
                  </list-item>
                  <list-item>
                    <p>14% Asian</p>
                  </list-item>
                  <list-item>
                    <p>12% Indian</p>
                  </list-item>
                  <list-item>
                    <p>12% Middle Eastern</p>
                  </list-item>
                  <list-item>
                    <p>12% Hispanic</p>
                  </list-item>
                  <list-item>
                    <p>51% Female</p>
                  </list-item>
                  <list-item>
                    <p>49% Male</p>
                  </list-item>
                </list>
              </td>
              <td>No (in-the-wild collection)</td>
              <td>Facial beauty prediction with ethnic diversity</td>
              <td>
                <list list-type="bullet">
                  <list-item>
                    <p>Small sample</p>
                  </list-item>
                  <list-item>
                    <p>Nonstandardized</p>
                  </list-item>
                  <list-item>
                    <p>Limited validation</p>
                  </list-item>
                  <list-item>
                    <p>Minimal literature usage</p>
                  </list-item>
                </list>
              </td>
              <td>Pearson <italic>r</italic> with human ratings; transfer learning validation</td>
            </tr>
            <tr valign="top">
              <td>FairFace (2021) [<xref ref-type="bibr" rid="ref31">31</xref>]</td>
              <td>N=108,501; 0-70+</td>
              <td>
                <list list-type="bullet">
                  <list-item>
                    <p>50% Female</p>
                  </list-item>
                  <list-item>
                    <p>50% Male</p>
                  </list-item>
                  <list-item>
                    <p>Approximately 14.3% in each of 7 major racial groups</p>
                  </list-item>
                </list>
              </td>
              <td>No (in-the-wild collection)</td>
              <td>Facial recognition<sup>b</sup></td>
              <td>
                <list list-type="bullet">
                  <list-item>
                    <p>Not designed for aesthetics</p>
                  </list-item>
                  <list-item>
                    <p>Nonstandardized</p>
                  </list-item>
                  <list-item>
                    <p>No attractiveness ratings</p>
                  </list-item>
                </list>
              </td>
              <td>Cross-dataset accuracy; fairness evaluation across demographics</td>
            </tr>
            <tr valign="top">
              <td>Diversity in Faces (2019) [<xref ref-type="bibr" rid="ref32">32</xref>]</td>
              <td>N=1,000,000; not reported</td>
              <td>Not categorized by race/ethnicity—uses objective facial coding schemes</td>
              <td>No (in-the-wild collection)</td>
              <td>Facial recognition<sup>b</sup></td>
              <td>
                <list list-type="bullet">
                  <list-item>
                    <p>No race/ethnicity labels</p>
                  </list-item>
                  <list-item>
                    <p>No attractiveness ratings</p>
                  </list-item>
                </list>
              </td>
              <td>Not reported</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table1fn1">
            <p><sup>a</sup>CNN: convolutional neural network.</p>
          </fn>
          <fn id="table1fn2">
            <p><sup>b</sup>Labeled Faces in the Wild, FairFace, and Diversity in Faces are facial recognition datasets not designed for attractiveness evaluation. They are included because they are widely used as pretraining resources, bias benchmarks, and demographic diversity references in the facial attractiveness AI literature.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <p>As datasets increase in size, standardization becomes increasingly challenging, with many datasets containing mixed lighting conditions, facial expressions, and makeup [<xref ref-type="bibr" rid="ref30">30</xref>]. A study comparing CNN models trained on the BLINQ dating site database (containing over 13,000 nonstandardized images) vs models trained on BLINQ and then fine-tuned on the standardized Chicago Face Database [<xref ref-type="bibr" rid="ref27">27</xref>] demonstrated that the 2-step training approach resulted in less variability in attractiveness scores, as facial expressions were shown to confound assessments [<xref ref-type="bibr" rid="ref30">30</xref>]. These findings underscore the need for purpose-built training systems that can also account for facial expressions.</p>
      <p>A critical challenge in current AI/ML training is the feedback loop problem known as “performativity,” where a model’s predictions influence future data distribution [<xref ref-type="bibr" rid="ref33">33</xref>]. Models trained on specific beauty standards inevitably amplify those standards over time [<xref ref-type="bibr" rid="ref15">15</xref>]. Bias can be introduced at multiple stages of the AI/ML development lifecycle [<xref ref-type="bibr" rid="ref34">34</xref>], including targeting bias (defining beauty standards), data acquisition bias (using homogeneous datasets), modeling bias, validation and evaluation bias, and deployment and monitoring bias. Without intentional design for objective assessment, these biases lead to inaccurate and culturally inappropriate aesthetic evaluations.</p>
    </sec>
    <sec>
      <title>Review of Bias Manifestations</title>
      <p>Demographic bias is pervasive in facial recognition and aesthetic AI models, resulting in unreliable predictions for underrepresented groups. In a comprehensive evaluation of facial recognition vendor tests, the National Institute of Standards and Technology (NIST) found significant performance variability across different demographic groups and advised users to be aware of these disparities when selecting algorithms [<xref ref-type="bibr" rid="ref35">35</xref>]. Facial analysis benchmarks demonstrate particularly unreliable predictions for underrepresented demographics, such as females with higher Fitzpatrick skin types [<xref ref-type="bibr" rid="ref14">14</xref>]. Beyond racial and gender bias, certain age groups, particularly individuals aged older than 60 years, are underrepresented in training data, further compromising accuracy in aesthetic evaluation for these populations [<xref ref-type="bibr" rid="ref36">36</xref>].</p>
      <p>As noted, Eurocentric beauty standards serve as the default framework in AI models [<xref ref-type="bibr" rid="ref4">4</xref>]. Features flagged as “flaws” may represent valued ethnic traits: broader nasal bridges characteristic of Arab populations are marked for “correction,” while fuller lips common in North African ethnicities are classified as “disproportionate” [<xref ref-type="bibr" rid="ref4">4</xref>]. Numerous studies have demonstrated that the golden ratio and other neoclassical canons inadequately capture attractiveness across diverse populations [<xref ref-type="bibr" rid="ref11">11</xref>]. Consequently, AI models built according to singular beauty standards risk inadvertently recommending the westernization of ethnic features without consideration of cultural appropriateness [<xref ref-type="bibr" rid="ref4">4</xref>].</p>
      <p>Regional and cultural aesthetic preferences vary significantly. Patients of Middle Eastern and North African descent often seek nasal tip refinement while preserving other ethnic characteristics such as dorsal height [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref37">37</xref>]. East Asian patients typically prioritize augmentation procedures over reduction [<xref ref-type="bibr" rid="ref13">13</xref>]. Despite significant cross-cultural variation in motivations, patient expectations frequently include conscious avoidance of “westernized” appearance [<xref ref-type="bibr" rid="ref37">37</xref>], and high satisfaction correlates strongly with preservation of ethnic features [<xref ref-type="bibr" rid="ref38">38</xref>].</p>
    </sec>
    <sec>
      <title>Review of Existing Bias Mitigation Approaches</title>
      <p>Researchers have developed bias mitigation strategies targeting different stages of AI model development. At the dataset level, initiatives like FairFace have created balanced demographic representations across race, gender, and age categories to address training data imbalances [<xref ref-type="bibr" rid="ref31">31</xref>]. Synthetic data generation using generative adversarial networks has been proposed to augment underrepresented demographic categories.</p>
      <p>Multiple algorithmic interventions have been developed to detect and reduce bias during model training. Adversarial debiasing methods have demonstrated improved fairness outcomes by mitigating bias acquired during data collection [<xref ref-type="bibr" rid="ref39">39</xref>]. Posttraining corrections, such as centroid fairness loss, enable bias measurement and performance alignment across demographic groups without requiring complete model retraining [<xref ref-type="bibr" rid="ref40">40</xref>]. Skewness-aware reinforcement learning approaches address data distribution imbalances [<xref ref-type="bibr" rid="ref41">41</xref>], while techniques like debiasing variational autoencoders adjust sampling probabilities for underrepresented categories [<xref ref-type="bibr" rid="ref42">42</xref>]. Meta-learning approaches enable models to adapt to regional aesthetic standards with limited culture-specific training data [<xref ref-type="bibr" rid="ref43">43</xref>].</p>
      <p>Bias detection has also advanced through evaluation methodologies. Researchers have documented systematic performance disparities across demographic groups in facial analysis systems. Studies examining intersectional accuracy gaps have revealed that these disparities stem from complex structural factors: for example, poor performance on dark-skinned females in gender classification results not from skin tone itself, but from differences in lip, eye, and cheek structure across ethnicities [<xref ref-type="bibr" rid="ref44">44</xref>].</p>
    </sec>
    <sec>
      <title>Gaps in Current Approaches</title>
      <p>Despite these advances in bias detection and mitigation, critical gaps remain in their application to aesthetic facial evaluation. First, current approaches address bias mitigation at isolated stages rather than across the complete AI development lifecycle. Techniques such as adversarial debiasing [<xref ref-type="bibr" rid="ref39">39</xref>], centroid fairness loss [<xref ref-type="bibr" rid="ref40">40</xref>], skewness-aware reinforcement learning [<xref ref-type="bibr" rid="ref41">41</xref>], and debiasing variational autoencoders [<xref ref-type="bibr" rid="ref42">42</xref>] have been validated in isolation primarily for facial recognition tasks—where the objective is identity verification—rather than aesthetic evaluation, where subjective cultural beauty standards introduce fundamentally different fairness challenges.</p>
      <p>Recent empirical evidence reveals fundamental inadequacies in current mitigation strategies. Dataset diversification efforts like FairFace [<xref ref-type="bibr" rid="ref31">31</xref>] balance demographic representation yet fail to address annotation bias, the systematic application of culturally specific aesthetic judgments by raters during labeling [<xref ref-type="bibr" rid="ref29">29</xref>]. A facial beauty prediction model trained on the multiethnic MEBeauty dataset exhibited significant prediction disparities across ethnic groups (<italic>P</italic>&#60;.001) even when evaluated on balanced data, with only 4.8%-9.5% of intergroup comparisons satisfying distributional parity criteria [<xref ref-type="bibr" rid="ref45">45</xref>]. More concerning, models demonstrated exacerbated bias on balanced demographic datasets compared to training performance, indicating that current approaches may amplify rather than mitigate societal biases when deployed to real-world populations [<xref ref-type="bibr" rid="ref45">45</xref>].</p>
      <p>Second, no comprehensive framework integrates technical solutions with essential nontechnical components across the AI development lifecycle. Existing approaches lack systematic stakeholder engagement, explainability requirements, and governance structures. While participatory design approaches exist for health care AI broadly [<xref ref-type="bibr" rid="ref46">46</xref>], aesthetic surgery applications do not systematically involve patients, clinicians, and cultural consultants in model development. Postdeployment monitoring, essential for detecting fairness drift and performance degradation across demographic subgroups [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref35">35</xref>], remains absent from aesthetic AI implementations, with validation protocols testing for bias before deployment rarely incorporated into development pipelines [<xref ref-type="bibr" rid="ref45">45</xref>].</p>
      <p>Third, the field lacks consensus on which fairness metrics are most appropriate for aesthetic contexts. While demographic parity, equalized odds, and equal opportunity are well defined [<xref ref-type="bibr" rid="ref47">47</xref>], their application to aesthetic evaluation poses unique challenges: achieving demographic parity may conflict with honoring culturally specific beauty standards [<xref ref-type="bibr" rid="ref4">4</xref>], and determining appropriate trade-offs requires stakeholder input that current approaches do not systematically incorporate. These gaps necessitate an integrated framework specifically designed for the unique challenges of bias mitigation in aesthetic facial evaluation.</p>
    </sec>
    <sec>
      <title>Proposed Framework for AI/ML Training in Aesthetic Facial Evaluation</title>
      <sec>
        <title>Overview</title>
        <p>To address these gaps, we propose the following framework for the training of AI/ML models in aesthetic evaluation that consists of 6 pillars: data collection and curation, model training methodologies, fairness metrics and evaluation, explainability and transparency, stakeholder engagement, and governance and monitoring.</p>
        <p>This framework addresses AI/ML systems used across several distinct aesthetic evaluation tasks, which differ substantially in their fairness stakes and acceptable error thresholds. Attractiveness scoring assigns a rating or ranking to a face and is used primarily for research benchmarking. Preoperative planning uses AI to assess anatomic features and inform surgical approach, where systematic undervaluation of ethnic features could directly influence clinical decision-making. Outcome simulation generates predicted postoperative appearance, where bias has the additional potential to alter rendered ethnicity rather than merely undervalue it, a qualitatively distinct harm addressed further in the framework. Patient counseling involves AI-assisted communication of options, where biased framing may subtly steer patient choices. Fairness requirements and error thresholds should be calibrated to the stakes of the specific use case; the more downstream the application and the more directly it affects patient choice and surgical planning, the more stringent the requirements.</p>
        <p>Throughout this framework, recommendations are categorized by their evidence base (<xref ref-type="table" rid="table2">Table 2</xref>). Practices marked as “established” draw on studies conducted in aesthetic facial evaluation contexts. Practices marked as “adapted” are supported by evidence from adjacent domains, primarily facial recognition, general medical AI, or computer vision, and have been translated to aesthetic evaluation by analogy; these require validation in aesthetic-specific contexts before adoption as standard practice. Practices marked as “proposed” represent conceptual recommendations without empirical validation in any closely related domain and should be treated as research directions.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Evidence classification for framework recommendations. Summary of bias mitigation practices included in the proposed framework, organized by strength of evidence. Established evidence base indicates practices supported by studies conducted in aesthetic facial evaluation contexts. Adapted evidence base indicates practices supported by evidence from adjacent domains (facial recognition, general medical AI, or computer vision) that have been translated to aesthetic evaluation by analogy and require validation in aesthetic-specific contexts before adoption as standard practice. Proposed evidence base indicates conceptual recommendations without empirical validation in any closely related domain, to be treated as research directions.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="150"/>
            <col width="270"/>
            <col width="140"/>
            <col width="240"/>
            <thead>
              <tr valign="top">
                <td>Bias source</td>
                <td>Pipeline stage</td>
                <td>Mitigation strategy</td>
                <td>Evidence base</td>
                <td>Residual gap</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Targeting bias: narrow beauty standard definition</td>
                <td>Data collection</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Pillar 1: ≥7 ethnic categories</p>
                    </list-item>
                    <list-item>
                      <p>Mixed-ancestry probabilistic labeling</p>
                    </list-item>
                    <list-item>
                      <p>Continuous morphometric representation</p>
                    </list-item>
                  </list>
                </td>
                <td>Established</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Discrete ethnic categories essentialize group-level patterns</p>
                    </list-item>
                    <list-item>
                      <p>Intragroup variation (eg, nationality, socioeconomic background) is rarely captured</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Annotation bias: culturally skewed rater judgments</td>
                <td>Data collection</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Pillar 1: diverse rater recruitment</p>
                    </list-item>
                    <list-item>
                      <p>Structured training and calibration</p>
                    </list-item>
                    <list-item>
                      <p>Tiered disagreement adjudication</p>
                    </list-item>
                    <list-item>
                      <p>Ongoing score audits</p>
                    </list-item>
                  </list>
                </td>
                <td>Adapted</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>No validated rater calibration protocol exists for aesthetic evaluation</p>
                    </list-item>
                    <list-item>
                      <p>Cultural feature weighting may persist despite diverse panels</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Generative adversarial network amplification bias: synthetic augmentation</td>
                <td>Data collection</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Pillar 1: quality control gate for synthetic images—fairness audit, feature distribution check, and human review</p>
                    </list-item>
                  </list>
                </td>
                <td>Adapted</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Mode collapse and feature exaggeration are documented in generative systems</p>
                    </list-item>
                    <list-item>
                      <p>Quality control criteria not validated for aesthetic contexts</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Modeling bias: fairness-unaware training</td>
                <td>Model training</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Pillar 2: adversarial debiasing</p>
                    </list-item>
                    <list-item>
                      <p>Centroid fairness loss</p>
                    </list-item>
                    <list-item>
                      <p>Skewness-aware reinforcement learning</p>
                    </list-item>
                    <list-item>
                      <p>Debiasing variational autoencoder</p>
                    </list-item>
                    <list-item>
                      <p>Multitask learning</p>
                    </list-item>
                  </list>
                </td>
                <td>Adapted</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>All techniques validated in facial recognition or general computer vision, not aesthetic evaluation</p>
                    </list-item>
                    <list-item>
                      <p>Several rest on preprint evidence</p>
                    </list-item>
                    <list-item>
                      <p>Combined validation absent</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Domain shift bias: train/deploy distribution mismatch</td>
                <td>Model training → deployment</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Pillar 2: hybrid pretraining on standardized images</p>
                    </list-item>
                    <list-item>
                      <p>Fine-tuning on clinical images</p>
                    </list-item>
                    <list-item>
                      <p>Domain generalization evaluation prerelease</p>
                    </list-item>
                  </list>
                </td>
                <td>Proposed</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>No validated hybrid protocol for aesthetic AI</p>
                    </list-item>
                    <list-item>
                      <p>Clinical image variation not systematically characterized</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Evaluation bias: aggregate metrics obscure subgroup disparities</td>
                <td>Evaluation</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Pillar 3: layered fairness metrics with prioritization hierarchy</p>
                    </list-item>
                    <list-item>
                      <p>Intersectional assessment</p>
                    </list-item>
                    <list-item>
                      <p>Bayesian hierarchical modeling for rare subgroups</p>
                    </list-item>
                  </list>
                </td>
                <td>Adapted</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Thresholds are proposed benchmarks without empirical derivation</p>
                    </list-item>
                    <list-item>
                      <p>Metrics can conflict</p>
                    </list-item>
                    <list-item>
                      <p>Intersectional sample sizes often insufficient</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Explainability gap: black-box outputs in a cultural context</td>
                <td>Evaluation → deployment</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Pillar 4: Grad-CAM<sup>a</sup>, LIME<sup>b</sup>, SHAP<sup>c</sup> with required human expert review</p>
                    </list-item>
                    <list-item>
                      <p>Geometric/physics-based models as a longer-term goal</p>
                    </list-item>
                  </list>
                </td>
                <td>Adapted</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Explainable AI tools cannot explain why features are culturally valued</p>
                    </list-item>
                    <list-item>
                      <p>No method validated for cultural appropriateness verification in aesthetic AI</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Human–AI decision bias: clinician interpretation and override</td>
                <td>Deployment</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Pillars 5 and 6: documentation and audit of AI recommendation override rates by patient demographic</p>
                    </list-item>
                    <list-item>
                      <p>Clinician training on implicit bias</p>
                    </list-item>
                  </list>
                </td>
                <td>Proposed</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>No empirical data on differential override in aesthetic AI</p>
                    </list-item>
                    <list-item>
                      <p>Audit infrastructure absent</p>
                    </list-item>
                    <list-item>
                      <p>Accountability for remediation undefined</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Deployment bias: commercial systems without governance</td>
                <td>Deployment</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Pillar 6: disclosure-based accountability for commercial developers</p>
                    </list-item>
                    <list-item>
                      <p>FDA SaMD<sup>d</sup> framework alignment</p>
                    </list-item>
                  </list>
                </td>
                <td>Proposed</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>No enforcement mechanism for commercial tools</p>
                    </list-item>
                    <list-item>
                      <p>Patient and clinician verification of compliance is currently impossible</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Drift bias: postdeployment fairness degradation</td>
                <td>Monitoring</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Pillar 6: tiered monitoring—continuous process control, quarterly review, annual audit, drift-triggered escalation</p>
                    </list-item>
                    <list-item>
                      <p>Designated AI clinical lead</p>
                    </list-item>
                  </list>
                </td>
                <td>Adapted</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Drift thresholds not empirically derived for aesthetic AI</p>
                    </list-item>
                    <list-item>
                      <p>Continuous monitoring may not be feasible for community practices</p>
                    </list-item>
                  </list>
                </td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>Grad-CAM: gradient-weighted class activation mapping.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>LIME: local interpretable model-agnostic explanations.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>SHAP: Shapley additive explanations.</p>
            </fn>
            <fn id="table2fn4">
              <p><sup>d</sup>FDA SaMD: Food and Drug Administration’s Software as a Medical Device.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Data Collection and Curation</title>
        <p>As a pragmatic baseline informed by existing benchmark datasets, training data should include a balanced representation of at least 7 racial and ethnic categories [<xref ref-type="bibr" rid="ref31">31</xref>] (White, Black, East Asian, Southeast Asian, Middle Eastern, Latino, and Indian), with additional stratification by gender, age, nationality, and socioeconomic background. This should include multiregional data collection with noted region-specific aesthetic preferences [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. This scheme is explicitly a minimum starting point, not a definitive classification; implementations should adopt more granular schemes as data availability permits. Individuals of mixed ancestry, a rapidly growing population, should be accommodated through multilabel or probabilistic ancestry representation rather than forced assignment to a single category. As the field matures, continuous morphometric representations of facial ancestry, such as principal components of facial geometry derived from diverse reference populations, offer a more biologically grounded alternative to discrete ethnic labels and should be pursued to reduce the risk of essentialization.</p>
        <p>While standardized photographs in a constrained environment are necessary to reduce training inaccuracies from facial expression confounds [<xref ref-type="bibr" rid="ref30">30</xref>], they introduce a domain-shift risk at deployment: real-world clinical images routinely involve variation in lighting, angle, makeup, and expression that differs systematically from controlled training conditions. To address this, we recommend a hybrid protocol: initial pretraining on standardized images to establish controlled baseline representations, followed by fine-tuning on a curated set of clinically realistic images incorporating documented augmentation strategies—including geometric transformations such as rotation and translation to simulate angle variation, and color space augmentations to simulate lighting variation—to reduce the gap between training and deployment distributions [<xref ref-type="bibr" rid="ref48">48</xref>]. Explicit evaluation of domain generalization, measuring performance and fairness metric stability across both standardized and nonstandardized image sets, should be required before clinical release.</p>
        <p>Even with standardized imaging protocols, acquiring sufficient photographs across all demographic categories remains challenging. Synthetic data generation can bridge these gaps while also protecting privacy, with pretraining gap analysis against established benchmark datasets such as FairFace [<xref ref-type="bibr" rid="ref31">31</xref>] or Diversity in Faces [<xref ref-type="bibr" rid="ref32">32</xref>] used to identify demographic imbalances or biases; these benchmarks have been established in facial attribute classification, though their applicability to aesthetic evaluation requires confirmation. Generative models are themselves trained on real-world data containing existing biases and are susceptible to mode collapse, where the model produces a narrow range of outputs disproportionately representing dominant features, and to hallucination of exaggerated demographic characteristics when conditioned on ethnic labels. These failure modes risk reintroducing the essentialized representations that the framework is designed to prevent. Accordingly, synthetic images must not enter the training set without explicit quality-control steps: adversarial fairness auditing of synthetic outputs to detect feature exaggeration; statistical comparison of synthetic image feature distributions against reference population benchmarks; and human review by the diverse rater panels proposed elsewhere in this framework. Only images passing all 3 criteria should be incorporated.</p>
        <p>Furthermore, photograph raters should be recruited from diverse cultural backgrounds to avoid annotation bias [<xref ref-type="bibr" rid="ref29">29</xref>]. Annotation bias occurs when raters systematically apply their own cultural aesthetic standards to evaluate faces from different backgrounds, but the patterns of bias are complex and do not reduce to simple in-group favoritism. Research demonstrates that rater ethnicity influences which facial features are emphasized—for instance, Chinese observers associate lighter skin tones with attractiveness in own-ethnicity faces, whereas European observers prefer warmer tones in Chinese faces [<xref ref-type="bibr" rid="ref8">8</xref>], and Japanese raters emphasize raised eyebrows in attractive male faces and smaller mouths in attractive female faces more than American raters [<xref ref-type="bibr" rid="ref9">9</xref>]. However, cross-cultural studies comparing attractiveness ratings across European, East Asian, and African faces found no strong own-race preference in overall attractiveness judgments [<xref ref-type="bibr" rid="ref49">49</xref>], indicating that annotation bias operates through subtle feature weighting rather than categorical group favoritism. For example, raters trained predominantly in Western aesthetic ideals might systematically underweight features like broader nasal bridges or fuller lips that are attractive within specific cultural contexts, not because of explicit racial preference but because their cultural training emphasizes different facial proportions. Recruiting ethnically diverse rater panels is therefore essential to ensure balanced representation of aesthetic preferences rather than assuming any single demographic composition will eliminate bias.</p>
      </sec>
      <sec>
        <title>Model Training Methodologies</title>
        <p>MTL is central to bias mitigation in AI/ML model training. Where feasible, models should be simultaneously trained on age, gender, ethnicity, facial expression, and attractiveness ratings to develop a comprehensive and nuanced assessment of faces [<xref ref-type="bibr" rid="ref36">36</xref>]. Rater cultural background, region of upbringing, and socioeconomic status should be recorded as covariates to enable analysis of how these dimensions influence annotation.</p>
        <p>Several fairness-aware techniques should be implemented during model training, adapted from the facial recognition literature but not yet validated in aesthetic evaluation. Adversarial learning methods [<xref ref-type="bibr" rid="ref39">39</xref>] should be applied during the training phase to mitigate bias acquired during data collection. Posttraining, centroid fairness loss [<xref ref-type="bibr" rid="ref40">40</xref>] enables bias measurement and performance alignment across demographic groups without requiring complete model retraining, a significant practical advantage. Skewness-aware reinforcement learning [<xref ref-type="bibr" rid="ref41">41</xref>] should be used to recognize and adjust imbalances in data distribution or model performance across demographics. Finally, debiasing variational autoencoders [<xref ref-type="bibr" rid="ref42">42</xref>] can adjust sampling probabilities for underrepresented categories, balancing the effective training data to enable more equitable performance across patient populations.</p>
        <p>Beyond these fairness-aware training techniques, AI/ML models can be designed to adapt to regional aesthetic standards for facial evaluation. A meta-learning approach, supported by preliminary evidence from the beauty prediction literature, enables models to “learn how to learn”: models trained on learning tasks from a range of cultures develop the ability to adapt to other cultural preferences more readily [<xref ref-type="bibr" rid="ref43">43</xref>]. This methodology can account for the subjective nature of beauty perception across cultures and allow customization based on patient population.</p>
        <p>While meta-learning enables adaptation to regional standards, the hierarchical structure of aesthetic preferences, with both universal and ethnicity-specific components, suggests opportunities for more sophisticated architectural approaches. Hierarchical Bayesian models could naturally encode this structure through multilevel parameter sharing, where population-level priors capture universal features while group-specific parameters account for cultural variation. Alternatively, the partial invariance framework extends invariant risk minimization by learning features that are invariant within partitions of training environments rather than globally invariant across all environments [<xref ref-type="bibr" rid="ref50">50</xref>]. Such approaches may encode averageness-related objectives within cultural partitions rather than across the full training distribution, preserving within-group distinctiveness while limiting cross-group homogenization. Universal structural features would be encoded at the population level only when evidence supports genuine cross-cultural validity, while regionally variable features remain governed by culture-specific parameters. Efficient multigroup equivariant techniques that address intersectional fairness across combinations of protected attributes, such as ethnicity and gender, may offer additional methodological directions [<xref ref-type="bibr" rid="ref51">51</xref>]. However, these techniques have not yet been validated for subjective aesthetic judgments, and their applicability to facial evaluation remains an open empirical question.</p>
        <p>Until such validation is established, these advanced architectural approaches should be considered active research directions rather than recommended clinical components. Several debiasing techniques in this section also rest on preprint or single-study evidence whose reproducibility has not been independently established; responsible clinical adoption requires, at a minimum, prospective studies demonstrating improvement in prespecified fairness metrics across multiple independent ethnic groups, independent reproducibility on held-out datasets, and direct comparison against uncontrolled baseline systems under clinically realistic conditions.</p>
      </sec>
      <sec>
        <title>Fairness Metrics and Evaluation</title>
        <p>A comprehensive fairness evaluation system integrates group fairness metrics, performance standards, and intersectional assessment. This layered approach prevents the common problem of achieving fairness on average while still having significant disparities in specific subpopulations, which is particularly critical in aesthetic facial evaluation, where cultural considerations vary significantly across intersectional identities.</p>
        <p>Fairness metrics are chosen based on the clinical application to facial evaluation. Group fairness metrics measure and reduce bias to ensure that models evaluate different demographic groups equitably [<xref ref-type="bibr" rid="ref47">47</xref>]. Demographic parity catches outcome bias by ensuring that the models’ aesthetic ratings are independent of ethnicity and equal across different demographic groups. Since the clinical utility of these models requires accuracy as well, equalized odds catches accuracy bias by ensuring that the models perform equally well across all demographic groups, with equal false-positive and false-negative rates in each group. Additionally, the evaluation system should include equal opportunity metrics as an option for clinicians when identifying positive outcomes is the priority and false positives are not an issue. In an aesthetic evaluation scenario, equal opportunity focuses on not missing attractive features while being more flexible about possible overestimation. However, different fairness metrics can conflict with one another, which makes it mathematically impossible to satisfy all of them simultaneously.</p>
        <p>We propose the following benchmarks as starting points for community debate and empirical refinement, not as validated thresholds: cultural concordance scores of at least 80%, as reviewed by regional expert review panels [<xref ref-type="bibr" rid="ref4">4</xref>], feature recognition accuracy of at least 95% [<xref ref-type="bibr" rid="ref4">4</xref>], and demographic parity in prediction accuracy with no more than 5% variance across groups. These figures represent reasonable aspirational targets informed by analogous fairness benchmarks in other clinical AI applications but require prospective validation in aesthetic evaluation contexts before adoption as standards.</p>
        <p>Because fairness metrics will conflict in practice, a decision hierarchy is necessary. Consider the following scenario: a model achieves demographic parity (equal average attractiveness ratings across ethnic groups) but does so by consistently overpredicting attractiveness for underrepresented groups while underperforming on fine-grained feature recognition for those same groups. Demographic parity is satisfied; equalized odds are not. In this scenario, we recommend prioritizing equalized odds because accuracy parity across groups is a prerequisite to the clinical utility of the tool. A model that fails to accurately recognize features in specific ethnic groups cannot serve those patients equitably, regardless of average score distributions. Equal opportunity metrics should then be applied as a secondary check, specifically where the clinical priority is avoiding false negatives, for example, ensuring that attractive features in underrepresented populations are not systematically missed.</p>
        <p>This hierarchy assumes deployment in a pluralistic patient population. In settings where the clinical population is demographically homogeneous, the relevant fairness question shifts: the priority becomes within-population accuracy and avoidance of intragroup bias, rather than cross-group parity. In such contexts, fine-tuning on locally representative data may be both technically appropriate and ethically indicated, provided that the resulting model is transparently scoped to its intended deployment population and not generalized beyond it. A model developed and validated for a specific national or regional context, for example, a system trained primarily on Korean patients for deployment in South Korea, should be evaluated against locally derived aesthetic norms and demographic distributions, and its scope of applicability documented accordingly.</p>
        <p>Because the above-mentioned techniques evaluate bias along individual criteria such as race or age, an intersectional assessment should be introduced into model training to evaluate overlapping biases that may compound. In aesthetic facial evaluation, intersectional bias due to the aggregation of multiple social identities, such as race, ethnicity, nationality, socioeconomic background, gender, and age, can incorrectly influence outcomes and may perpetuate stereotypes [<xref ref-type="bibr" rid="ref14">14</xref>]. An intersectional assessment provides the framework to apply metrics and standards across complex, overlapping demographic categories such as Black women or older Asian men.</p>
        <p>Because individuals hold multiple overlapping identities simultaneously, each with associated social norms and expectations, aesthetic preferences cannot be understood by analyzing demographic categories in isolation. This theoretical foundation reinforces that intersectional assessment addresses the fundamental mechanism through which identity influences perception [<xref ref-type="bibr" rid="ref52">52</xref>].</p>
        <p>Practically, however, current dataset sizes preclude stable estimates across all intersectional combinations, necessitating a prioritization hierarchy. High-risk intersections with documented performance disparities—such as older women with higher Fitzpatrick skin types—should be evaluated as the primary tier. First, compositional approaches offer a promising avenue for addressing certain intersectionality challenges more tractably: multigroup equivariant network designs that use product groups can provide fairness guarantees across intersectional demographic combinations with computational complexity proportional to the sum rather than the product of group sizes, as demonstrated in natural language generation debiasing tasks [<xref ref-type="bibr" rid="ref51">51</xref>]. Second, Bayesian hierarchical modeling of rare subgroups enables partial pooling of statistical strength from related intersections, providing more stable estimates for low-frequency cells; this approach ties naturally to the hierarchical architectures discussed in the Model Training Methodologies section. Third, multitask regularization can share statistical strength across related intersectional categories during training. Intersectional combinations not covered by the primary tier should be explicitly designated for future work rather than omitted without acknowledgment.</p>
      </sec>
      <sec>
        <title>Explainability and Transparency</title>
        <p>Explainability and transparency are prerequisites for trustworthy AI/ML systems in clinical use. Three different techniques adapted from general medical AI would be beneficial for training models used in aesthetic facial evaluation: gradient-weighted class activation mapping (Grad-CAM), local interpretable model-agnostic explanations (LIME), and Shapley additive explanations (SHAP) [<xref ref-type="bibr" rid="ref53">53</xref>]. Grad-CAM provides visual and spatial explanations through heatmaps that highlight which facial regions contribute most to a model’s aesthetic predictions. Clinicians can identify which anatomic features the model prioritizes and can also verify that the model emphasizes culturally appropriate features rather than defaulting to Eurocentric beauty standards. LIME provides case-specific explanations of a model’s individual predictions by showing which features influenced that specific assessment. This technique is model-agnostic and therefore flexible and versatile, meaning it can be applied to any AI/ML architecture—whether convolutional neural networks, transformer models, or future technologies—making it adaptable as the field evolves. SHAP provides information about how each evaluated feature contributes to the specific output of a model. It can explain individual predictions and provide a global overview of which features are most important in a dataset. The practical value of SHAP-based interpretation has also been demonstrated in surgical predictive AI, where feature-level explanation was used to identify key perioperative risk factors and improve transparency of model behavior for clinical decision-making [<xref ref-type="bibr" rid="ref54">54</xref>]. Although these methods continue to evolve through ongoing algorithmic refinements, they are useful but not sufficient for verifying cultural appropriateness in aesthetic evaluation. They identify which facial regions or features influence a model’s output, but do not explain why those features are aesthetically valued within a specific cultural context, which is the central interpretive question this framework is designed to address. Human expert review by culturally knowledgeable clinicians is, therefore, a required complement to explainable AI (XAI) output, not an optional one.</p>
        <p>Intrinsically interpretable approaches, which incorporate domain knowledge directly into model architecture rather than applying post hoc explanation methods, represent a more direct path toward clinical-grade cultural interpretability. For facial aesthetic evaluation, this includes explicitly encoding geometric relationships (such as facial proportions, angles, and distances) as structured features within the model, and physics-based models that incorporate established anatomical principles and morphometric relationships.</p>
        <p>Transparency standards should ensure that stakeholders have access to essential information about model development and validation. Transparency requirements for AI/ML training models should include the documentation of training data demographics and model architecture, interpretable explanations for aesthetic assessments, and disclosure of identified biases and performance disparities across demographics.</p>
        <p>For generative outcome simulation systems specifically, XAI tools must address an additional interpretive requirement: clinicians should be able to verify that simulated postoperative appearances reflect only the intended surgical modifications and do not introduce ethnically incongruent features as artifacts of model bias. This requires comparison of presimulation and postsimulation facial geometry at the feature level, which current Grad-CAM and LIME implementations are not designed to provide; bespoke evaluation protocols for generative systems are needed.</p>
      </sec>
      <sec>
        <title>Stakeholder Engagement and Participatory Design</title>
        <p>The development of AI models should involve AI developers, patients, clinicians, cultural consultants, and ethicists as equal partners from the conceptualization stage through implementation and postdeployment monitoring [<xref ref-type="bibr" rid="ref46">46</xref>]. This cocreation process ensures that diverse perspectives shape decision-making at every stage, from selecting training datasets to interpreting model outputs to refining algorithms based on real-world performance. Workshops and iterative feedback sessions should be conducted throughout the development lifecycle to gather input on critical questions such as which facial features should be prioritized for analysis, how to define culturally appropriate aesthetic outcomes, and whether model recommendations align with patient values and clinical judgment. Participant recruitment should deliberately include patients who identify across multiple marginalized dimensions simultaneously, such as older immigrant women from non-Western countries, as their aesthetic priorities and experiences of algorithmic bias are likely to differ from those captured by single-axis demographic sampling. It is vital to recognize and address potential power dynamics to ensure that underrepresented stakeholders’ voices are valued equally alongside those of technical experts and established institutions.</p>
        <p>Regional expert review panels convened to evaluate cultural concordance must be constituted with explicit accountability requirements. Panel composition should include representatives from diverse geographic regions within each cultural community, not only urban or elite centers, and should reflect variation in socioeconomic background, age, and gender. Selection criteria should be documented and publicly disclosed. Formal mechanisms for recording minority and dissenting views are required; panel reports should distinguish consensus from majority positions and preserve dissenting opinions for review. Periodic panel rotation and external audit of panel composition guard against the entrenchment of a single institutionalized aesthetic perspective. These panel requirements must be operationalized through rigorous rater training and calibration protocols.</p>
        <p>Rater training and calibration protocols are essential to annotation quality. Before scoring, raters should complete structured training that includes: an orientation to the study’s cultural equity goals; exposure to diverse face exemplars across all demographic categories to be rated; and explicit instruction to evaluate attractiveness according to within-group cultural standards rather than a universal ideal. Calibration should be conducted using a standardized set of anchor images—rated in advance by a culturally matched expert panel—against which individual rater scores are benchmarked. Raters whose scores diverge systematically from calibration anchors by more than a prespecified threshold (for example, mean absolute deviation greater than 1.0 on a 10-point scale) should receive additional training before contributing to the primary dataset. For images where rater scores span more than 3 points on a 10-point scale, the image should be flagged for adjudication by a culturally matched expert reviewer rather than resolved by averaging. Averaged scores obscure genuine aesthetic disagreement that may itself be informative about cultural variation. Ongoing audit of rater score distributions by demographic subgroup should be conducted throughout data collection to detect systematic drift in individual rater calibration.</p>
        <p>A human-centered design framework positions clinicians and patients as essential collaborators while respecting their cultural contexts and prior experiences. Patients who have undergone aesthetic procedures provide experiential knowledge about how cultural identity influences aesthetic goals, what features they sought to preserve vs modify, and how algorithmic recommendations might have impacted their decision-making. This bidirectional learning process builds cultural competency across all stakeholders and creates the foundation for AI systems that serve diverse populations equitably.</p>
        <p>Additionally, informed consent architecture is a prerequisite for ethical dataset development. Individuals depicted in training images must provide explicit consent for secondary use of their facial photographs in AI training, with the right to withdraw consent and have their images removed from future training cycles. This requirement applies regardless of whether images are sourced from clinical records, publicly available datasets, or social media platforms, and must account for jurisdiction-specific biometric privacy regulations and state-level statutes. However, consent withdrawal raises a technically significant challenge: once a model is trained on data, removing a data point’s influence from a deployed model without full retraining is computationally costly. The emerging field of machine unlearning addresses this problem through methods such as approximate unlearning and influence function-based data removal, though these techniques have not yet been validated or operationalized in medical AI governance contexts. Until practical machine unlearning protocols are established for clinical AI, consent frameworks should, at a minimum, guarantee removal from future retraining cycles and document this limitation transparently as a residual risk. Raters whose aesthetic judgments become training labels should similarly provide informed consent, be compensated equitably, and retain the right to withdraw their ratings from the dataset. Evolving Food and Drug Administration (FDA) guidance on training data provenance under the Software as a Medical Device (SaMD) framework should be monitored for additional requirements as it develops. Documentation of consent procedures, including the current technical limitations of consent withdrawal from deployed models, should be included in the transparency disclosures required elsewhere in this framework.</p>
      </sec>
      <sec>
        <title>Governance and Continuous Monitoring</title>
        <p>A governance structure for AI models should be based on multidisciplinary committees including clinicians, ethicists, data scientists, and patient representatives. This allows for ongoing ethical review with diverse stakeholder input and mandates human oversight for all AI-driven recommendations, requiring clinician review of model outputs before they inform patient consultations or treatment planning. To ensure regulatory compliance, developers should align with the FDA’s SaMD framework [<xref ref-type="bibr" rid="ref55">55</xref>] and remain aware of evolving federal regulations for AI in health care.</p>
        <p>Bias in the human–AI decision system extends beyond model development to the point of clinical use. Even a well-calibrated, fairness-aware model can produce inequitable outcomes if clinicians or institutions apply its outputs selectively or inconsistently across patient groups. Research on physician implicit bias [<xref ref-type="bibr" rid="ref56">56</xref>] suggests that providers may differentially override algorithmic recommendations based on patient demographics, accepting recommendations for patients who resemble the provider’s implicit reference population while discounting them for others. Governance structures should include mandatory documentation of AI recommendation acceptance or rejection by patient demographic category, regular review of override rates stratified by patient race, ethnicity, gender, age, and language, and structured clinician training on the mechanisms of implicit bias in AI-assisted decision-making. Where systematic override disparities are detected, the governance committee should determine whether the source is model error for specific subgroups or clinician bias in interpretation. Fairness in aesthetic AI, therefore, requires monitoring both algorithmic outputs and human responses to those outputs.</p>
        <p>We define drift as a statistically significant change, exceeding prespecified control limits, in 1 or more of the following: prediction accuracy by demographic subgroup, fairness metric values (demographic parity, equalized odds, cultural concordance), or the distribution of model inputs relative to the training distribution. Demographic-subgroup-specific degradation that does not affect overall accuracy is a particularly important drift signal, as aggregate metrics can mask emerging disparities. Institutional responsibility for drift response should be assigned explicitly at deployment: a designated AI clinical lead bears primary responsibility for reviewing automated alerts, convening the multidisciplinary governance committee, and authorizing remediation. Triggered remediation actions should follow a tiered protocol keyed to severity: minor drift triggers increased monitoring frequency and a targeted data audit; moderate drift triggers model recalibration or posttraining correction without full retraining; severe drift, including any demographic subgroup falling below institutionally defined minimum performance thresholds, triggers suspension of AI-assisted outputs for affected use cases pending full model retraining and revalidation. The specific thresholds delineating these severity tiers should be defined prospectively by each deploying institution based on clinical context, use case stakes, and available monitoring infrastructure, rather than adopted from universal benchmarks for which no empirical basis currently exists in aesthetic AI. All drift events and remediation actions should be documented in an institutional AI governance log and reported in periodic transparency disclosures.</p>
        <p>Postdeployment monitoring of the AI/ML models should include continuous tracking of prediction accuracy, fairness metrics across demographic groups, and concordance with clinical judgment. A tiered monitoring approach is recommended. Continuous monitoring using statistical process control on prespecified fairness metrics, with automated alerts when metrics exceed the control limits described above, provides the first line of detection. Quarterly structured reviews should assess fairness metric trends and flag emerging disparities for clinical review. An annual deep audit evaluates model architecture, training data composition, rater panel diversity, and alignment with updated regulatory guidance. Where continuous monitoring infrastructure is not feasible, drift-triggered audits, initiated automatically when prediction distributions shift beyond a prespecified threshold, represent a minimum acceptable alternative. The appropriate monitoring intensity scales with deployment volume: high-volume systems serving diverse patient populations require continuous monitoring; lower-volume or single-institution implementations may operate on a quarterly-plus-annual cycle with drift-triggered escalation. When audits identify performance disparities exceeding 5% variance across demographic groups, retraining (or appropriate posttraining correction) should be initiated. Feedback loops integrating clinician and patient input, as well as regular retraining cycles incorporating new, diverse data, allow for ongoing model improvement.</p>
        <p>Ultimately, this framework is directed primarily at academic and institutional developers and is intended as aspirational guidance and input to regulatory deliberation. Most currently deployed aesthetic AI tools are commercial, including consumer-facing filters, practice-management platforms, and direct-to-consumer assessment applications, and fall outside the scope of institutional governance structures. We recommend that regulatory bodies consider disclosure-based accountability as a lighter-touch regulatory instrument: commercial developers would publicly document which framework components their systems satisfy, analogous to transparency requirements in other regulated industries, enabling clinicians and patients to assess compliance. We acknowledge that even this approach requires formal regulatory action and cannot be implemented through voluntary adoption alone. Long-term, enforcement mechanisms aligned with the FDA’s evolving SaMD framework and equivalent international regulations will be necessary to extend these standards to commercial systems.</p>
      </sec>
    </sec>
    <sec>
      <title>Limitations and Unresolved Challenges</title>
      <p>This framework addresses critical biases in aesthetic AI training but faces implementation challenges in operationalizing the distinction between universal and culture-specific aesthetic features. While empirical evidence establishes that certain features (averageness, femininity in female faces) demonstrate cross-cultural appeal, whereas others (skin coloration emphasis, specific feature preferences) vary by cultural context, translating this nuance into algorithmic systems remains complex. Hierarchical approaches, including hierarchical Bayesian models, partial invariance, and multigroup equivariant techniques [<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref57">57</xref>], theoretically offer a middle ground by encoding universal structural principles in base layers while allowing culture-specific parameters for regionally variable features. However, practical deployment carries risks: explicitly categorizing training data by ethnicity may essentialize group-level patterns, calcifying what constitutes “Asian beauty” or “African beauty” rather than honoring individual variation within cultural communities. Moreover, broad ethnic categories obscure meaningful contextual differences; for instance, North Korean, South Korean, Korean-American, and Korean-Canadian individuals may hold divergent aesthetic preferences despite shared ethnic heritage, yet training data rarely capture this granularity. The framework’s emphasis on continuous monitoring and stakeholder feedback provides mechanisms to detect such unintended consequences, but cannot eliminate these tensions entirely.</p>
      <p>Technical limitations constrain practical implementation. As noted, fairness metrics can mathematically conflict, and when improving performance for one demographic group worsens outcomes for another, the framework provides insufficient guidance on prioritization. Intersectional assessment becomes computationally prohibitive when evaluating all meaningful combinations of race, age, gender, and other attributes, requiring sample sizes that may not exist for rare intersectional categories. Most proposed techniques have been validated in isolation rather than as an integrated system, creating uncertainty about their combined effectiveness.</p>
      <p>The framework may create unintended harms despite ethical intentions. Formalizing cultural aesthetic standards into training data risks reifying what should remain individually variable. Overreliance on AI-assisted evaluation may affect clinicians’ ability to make nuanced judgments. The resource-intensive requirements, including multidisciplinary committees, annual audits, and continuous monitoring, may be feasible only for well-funded institutions, potentially widening disparities between elite centers and community practices that either avoid AI tools entirely or use inadequately validated commercial systems.</p>
      <p>Despite these significant limitations, this framework represents the most viable approach to addressing documented harms in current systems. Culture-neutral algorithms trained on predominantly Western datasets demonstrably perpetuate Eurocentric beauty standards and generate systematically higher error rates for underrepresented populations [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref35">35</xref>]—creating risk that algorithmic recommendations may conflict with patient goals of ethnic feature preservation [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]. Intentionally encoding cultural awareness during training is preferable to allowing implicit Western bias to persist unchecked, and the framework’s emphasis on continuous monitoring and stakeholder engagement provides mechanisms for identifying and correcting unintended consequences as they emerge, making this an iterative rather than static solution. Bias mitigation in aesthetic AI remains an evolving challenge requiring ongoing research, stakeholder dialogue, and willingness to revise approaches as evidence accumulates.</p>
    </sec>
    <sec>
      <title>Implementation Considerations</title>
      <p>Implementing this framework requires strategic approaches to address inherent complexities in developing fair and accurate aesthetic evaluation systems. A phased implementation strategy allows institutions to prioritize components based on available resources, with initial phases focusing on data diversification and basic fairness metrics before advancing to sophisticated techniques such as meta-learning and intersectional assessment.</p>
      <p>Addressing data limitations requires combining synthetic data generation, transfer learning methods, and multi-institutional collaboration. Pooling datasets across institutions expands demographic coverage while distributing resource burdens and facilitating industry standards for dataset requirements, fairness thresholds, and validation protocols. Federated learning, established in medical imaging and adapted for aesthetic AI applications, provides a privacy-preserving framework that enables multi-institutional and multinational collaboration while maintaining HIPAA (Health Insurance Portability and Accountability Act) compliance. In this approach, each institution trains models on its local dataset without sharing raw patient images; only model parameters are transmitted between sites, addressing both regulatory requirements and the need for demographically diverse training data. This decentralized architecture has been successfully demonstrated in medical imaging applications, showing that models trained via federated learning can achieve comparable or superior performance to centralized training while preserving patient privacy [<xref ref-type="bibr" rid="ref58">58</xref>]. However, vanilla federated learning does not guarantee privacy: gradient leakage, membership inference attacks, and model inversion techniques can reconstruct sensitive features of training images from transmitted model parameters. Clinically deployable federated systems, therefore, require additional mitigations: differential privacy limits individual-level information leakage, secure aggregation protocols ensure that parameter updates are aggregated without exposing individual site contributions, and homomorphic encryption may be warranted in high-risk deployments involving sensitive biometric data. 
HIPAA compliance of a federated system depends on these additional safeguards, not on the federated architecture alone, and implementers should document which mitigations are in place as part of their data governance and regulatory submissions.</p>
      <p>Multisite data acquisition requires harmonization protocols that go beyond technical compatibility. Participating institutions should adopt a shared data dictionary defining demographic subgroup categories, attractiveness rating scales, and imaging standards before data collection begins; naive pooling of heterogeneous datasets across sites can amplify distributional asymmetries and yield biased estimators, and post hoc harmonization of inconsistently defined variables compounds this risk [<xref ref-type="bibr" rid="ref59">59</xref>]. Subgroup definitions should be governed by a standing dataset governance committee with representation from each participating institution and from community members of the populations being represented; this committee should have authority to revise subgroup definitions as evidence about their validity accumulates. Long-term dataset maintenance requires designated institutional roles: a data steward responsible for tracking consent status and honoring withdrawal requests, a technical curator responsible for version control and documentation of any dataset changes, and a scientific lead responsible for periodic assessment of whether the dataset’s demographic composition remains representative of the clinical population it is intended to serve. Datasets should be versioned explicitly so that fairness audits can be traced to the data on which a model was trained. Multisite data sharing agreements should specify data retention limits, destruction protocols, and procedures for incorporating newly consented data into existing pipelines without reintroducing batch effects.</p>
      <p>The selection of fairness metrics presents practical trade-offs between comprehensiveness and computational efficiency. Given that simultaneous optimization of all fairness metrics is mathematically impossible, developers should understand that different metrics detect distinct types of bias. For facial aesthetic evaluation, using multiple complementary metrics is essential: demographic parity identifies whether the model systematically undervalues certain demographic groups, equal opportunity detects failures to recognize attractive features in underrepresented populations, and equalized odds ensures accuracy parity across all groups. While resource constraints may require prioritizing certain metrics during initial implementation, comprehensive evaluation across multiple fairness dimensions should remain the long-term goal. Metric prioritization should be calibrated to the clinical use case. For example, a preoperative planning tool used across a demographically diverse practice should prioritize equalized odds, ensuring that error rates in feature recognition are equivalent across ethnic groups, because differential accuracy directly affects surgical decision-making. By contrast, an outcome simulation tool used in patient counseling might prioritize equal opportunity, ensuring that attractive features in underrepresented groups are not systematically missed or underrendered, because the primary harm is failure to surface positive options rather than differential error rates. A research benchmarking tool, with lower direct clinical stakes, might accept demographic parity as a sufficient initial standard while longitudinal validation data are collected.</p>
    </sec>
    <sec>
      <title>Conclusion</title>
      <p>The integration of AI/ML into aesthetic facial evaluation presents both opportunity and risk. Without intentional intervention, algorithmic systems will perpetuate Eurocentric beauty standards, generate higher error rates for underrepresented populations, and risk aesthetic homogenization. This framework provides comprehensive bias mitigation through 6 interconnected pillars, from diverse data collection and fairness-aware training to XAI, stakeholder engagement, and continuous monitoring (<xref rid="figure1" ref-type="fig">Figure 1</xref>).</p>
      <fig id="figure1" position="float">
        <label>Figure 1</label>
        <caption>
          <p>The 6-pillar framework for bias mitigation in artificial intelligence/machine learning (AI/ML) models for aesthetic facial evaluation. Visual overview of the proposed framework spanning the full AI development lifecycle. The 6 pillars are: (1) diverse data collection with synthetic augmentation, (2) fairness-aware training techniques, (3) complementary fairness metrics with intersectional assessment, (4) explainable AI for clinical transparency, (5) stakeholder engagement and participatory design, and (6) governance and continuous monitoring. Arrows indicate the interdependent nature of the pillars across the development lifecycle.</p>
        </caption>
        <graphic xlink:href="jmir_v28i1e95452_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <p>The framework mitigates the tension between universal aesthetic principles and cultural specificity rather than fully resolving it; the mathematical conflict between optimizing for averageness and preserving ethnic distinctiveness remains an open research problem requiring empirical validation of hierarchical architectural approaches. Within these constraints, the framework positions diverse stakeholders, including patients, clinicians, cultural consultants, and ethicists, as essential collaborators whose input shapes algorithm development and refinement.</p>
      <p>Significant challenges remain: tensions between algorithmic objectivity and cultural subjectivity may not be fully resolved, fairness metrics may conflict, and resource-intensive implementation may widen institutional disparities. Despite these limitations, this framework represents a necessary step toward ethical AI development in aesthetic medicine, providing actionable guidance for developers, clinicians, and institutions committed to equitable care.</p>
      <p>For this framework to move from proposal to validated practice, concrete validation end points must be defined. Prospective comparative studies should measure prespecified fairness metrics, such as demographic parity, equalized odds, and cultural concordance, in framework-compliant systems versus uncontrolled alternatives. Patient-reported outcomes measuring satisfaction with ethnic feature preservation should serve as a primary clinical end point, given that algorithmic recommendations ultimately affect patient goals and identity. Clinician agreement studies should assess whether XAI-explained outputs align with expert aesthetic judgment across demographic subgroups. Finally, longitudinal drift detection benchmarks should evaluate whether fairness gains are maintained as models are retrained on new data. Randomized deployment designs, in which framework-compliant and noncompliant systems are compared in parallel with patient consent, would provide the strongest evidence but raise practical and ethical challenges requiring dedicated methodological attention.</p>
      <p>As AI capabilities advance, ongoing research must address operationalizing cultural appropriateness, validating integrated bias mitigation techniques, and ensuring technological progress serves patient-centered, culturally responsive care. This broader implementation challenge is consistent with recent work emphasizing that AI adoption in health care requires structured implementation pathways and explicit risk mitigation strategies rather than relying on technical advancement alone [<xref ref-type="bibr" rid="ref60">60</xref>]. These principles extend beyond aesthetic surgery to any facial analysis AI application, establishing foundations for fair and transparent algorithmic systems across diverse clinical contexts.</p>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI/ML</term>
          <def>
            <p>artificial intelligence/machine learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">CNN</term>
          <def>
            <p>convolutional neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">FDA</term>
          <def>
            <p>Food and Drug Administration</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">Grad-CAM</term>
          <def>
            <p>gradient-weighted class activation mapping</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">HIPAA</term>
          <def>
            <p>Health Insurance Portability and Accountability Act</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">LIME</term>
          <def>
            <p>local interpretable model-agnostic explanations</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">ML</term>
          <def>
            <p>machine learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">MTL</term>
          <def>
            <p>multitask learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">NIST</term>
          <def>
            <p>National Institute of Standards and Technology</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">SaMD</term>
          <def>
            <p>Software as a Medical Device</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">SCUT-FBP</term>
          <def>
            <p>South China University of Technology—Facial Beauty Prediction</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">SHAP</term>
          <def>
            <p>Shapley additive explanations</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">XAI</term>
          <def>
            <p>explainable AI</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors did not use generative artificial intelligence (AI) tools or technologies in the preparation of this manuscript.</p>
    </ack>
    <notes>
      <title>Data Availability</title>
      <p>Data sharing is not applicable to this article as no datasets were generated or analyzed during this study.</p>
    </notes>
    <notes>
      <title>Funding</title>
      <p>The authors declared no financial support was received for this work.</p>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>ARK contributed to conceptualization, methodology, writing of the original draft, and writing—review and editing. LRV contributed to conceptualization, writing—review and editing, and supervision.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fortune-Ely</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Achanta</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>MSH</given-names>
            </name>
          </person-group>
          <article-title>The future of artificial intelligence in facial plastic surgery</article-title>
          <source>JPRAS Open</source>
          <year>2024</year>
          <volume>39</volume>
          <fpage>89</fpage>
          <lpage>92</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2352-5878(23)00095-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jpra.2023.11.016</pub-id>
          <pub-id pub-id-type="medline">38186379</pub-id>
          <pub-id pub-id-type="pii">S2352-5878(23)00095-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC10770469</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ghasemi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dashti</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence and deep learning in preservation rhinoplasty: a review</article-title>
          <source>Am J Cosmet Surg</source>
          <year>2024</year>
          <volume>41</volume>
          <issue>4</issue>
          <fpage>225</fpage>
          <lpage>229</lpage>
          <pub-id pub-id-type="doi">10.1177/07488068231224133</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Park</surname>
              <given-names>KW</given-names>
            </name>
            <name name-style="western">
              <surname>Diop</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Willens</surname>
              <given-names>SH</given-names>
            </name>
            <name name-style="western">
              <surname>Pepper</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence in facial plastics and reconstructive surgery</article-title>
          <source>Otolaryngol Clin North Am</source>
          <year>2024</year>
          <volume>57</volume>
          <issue>5</issue>
          <fpage>843</fpage>
          <lpage>852</lpage>
          <pub-id pub-id-type="doi">10.1016/j.otc.2024.05.002</pub-id>
          <pub-id pub-id-type="medline">38971626</pub-id>
          <pub-id pub-id-type="pii">S0030-6665(24)00071-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Makhseed</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Arian</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Shuaib</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Beyond the algorithm: a perspective on tackling bias and cultural sensitivity in AI-guided aesthetic standards for cosmetic surgery in the Middle East and North Africa (MENA) region</article-title>
          <source>Clin Cosmet Investig Dermatol</source>
          <year>2025</year>
          <volume>18</volume>
          <fpage>2173</fpage>
          <lpage>2182</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.2147/CCID.S543045"/>
          </comment>
          <pub-id pub-id-type="doi">10.2147/CCID.S543045</pub-id>
          <pub-id pub-id-type="medline">40927497</pub-id>
          <pub-id pub-id-type="pii">543045</pub-id>
          <pub-id pub-id-type="pmcid">PMC12416507</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goshtasbi</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hakimi</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>BJ</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence versus human focus group rating of facial attractiveness</article-title>
          <source>Facial Plast Surg Aesthet Med</source>
          <year>2024</year>
          <volume>26</volume>
          <issue>4</issue>
          <fpage>371</fpage>
          <lpage>376</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://escholarship.org/uc/item/qt9573t2xh"/>
          </comment>
          <pub-id pub-id-type="doi">10.1089/fpsam.2023.0281</pub-id>
          <pub-id pub-id-type="medline">38377584</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Laurentini</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bottino</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Computer analysis of face beauty: a survey</article-title>
          <source>Comput Vis Image Und</source>
          <year>2014</year>
          <volume>125</volume>
          <fpage>184</fpage>
          <lpage>199</lpage>
          <pub-id pub-id-type="doi">10.1016/j.cviu.2014.04.006</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kleisner</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Tureček</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Saribay</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Pavlovič</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Leongómez</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Roberts</surname>
              <given-names>SC</given-names>
            </name>
          </person-group>
          <article-title>Distinctiveness and femininity, rather than symmetry and masculinity, affect facial attractiveness across the world</article-title>
          <source>Evol Hum Behav</source>
          <year>2024</year>
          <volume>45</volume>
          <issue>1</issue>
          <fpage>82</fpage>
          <lpage>90</lpage>
          <pub-id pub-id-type="doi">10.1016/j.evolhumbehav.2023.10.001</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xiao</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Pointer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wuerger</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Skin coloration is a culturally-specific cue for attractiveness, healthiness, and youthfulness in observers of Chinese and western European descent</article-title>
          <source>PLoS One</source>
          <year>2021</year>
          <volume>16</volume>
          <issue>10</issue>
          <fpage>e0259276</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0259276"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0259276</pub-id>
          <pub-id pub-id-type="medline">34710190</pub-id>
          <pub-id pub-id-type="pii">PONE-D-21-13133</pub-id>
          <pub-id pub-id-type="pmcid">PMC8553160</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sano</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kawabata</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Cultural and gender influences on facial attractiveness: a comparative study of Japanese and American raters using geometric morphometrics</article-title>
          <source>Psych J</source>
          <year>2026</year>
          <volume>15</volume>
          <issue>1</issue>
          <fpage>e70065</fpage>
          <pub-id pub-id-type="doi">10.1002/pchj.70065</pub-id>
          <pub-id pub-id-type="medline">41239838</pub-id>
          <pub-id pub-id-type="pmcid">PMC12856234</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pavlovič</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Fiala</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Kleisner</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Congruence in European and Asian perception of Vietnamese facial attractiveness, averageness, symmetry and sexual dimorphism</article-title>
          <source>Sci Rep</source>
          <year>2023</year>
          <volume>13</volume>
          <issue>1</issue>
          <fpage>13320</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-023-40458-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-023-40458-1</pub-id>
          <pub-id pub-id-type="medline">37587194</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-023-40458-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC10432390</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Arian</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Alroudan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Alkandari</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Shuaib</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Cosmetic surgery and the diversity of cultural and ethnic perceptions of facial, breast, and gluteal aesthetics in women: a comprehensive review</article-title>
          <source>Clin Cosmet Investig Dermatol</source>
          <year>2023</year>
          <volume>16</volume>
          <fpage>1443</fpage>
          <lpage>1456</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.tandfonline.com/doi/10.2147/CCID.S410621?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub%20%200pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.2147/CCID.S410621</pub-id>
          <pub-id pub-id-type="medline">37313510</pub-id>
          <pub-id pub-id-type="pii">410621</pub-id>
          <pub-id pub-id-type="pmcid">PMC10258039</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pozzi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Fàdel</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Bolletta</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cuomo</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Roxo</surname>
              <given-names>CW</given-names>
            </name>
          </person-group>
          <article-title>Ethnic rhinoplasty: preliminary results of our technique in the pursuit of the harmonious nose</article-title>
          <source>J Plast Reconstr Aesthet Surg</source>
          <year>2023</year>
          <volume>87</volume>
          <fpage>135</fpage>
          <lpage>146</lpage>
          <pub-id pub-id-type="doi">10.1016/j.bjps.2023.09.036</pub-id>
          <pub-id pub-id-type="medline">37839388</pub-id>
          <pub-id pub-id-type="pii">S1748-6815(23)00537-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lam</surname>
              <given-names>SM</given-names>
            </name>
          </person-group>
          <article-title>Asian rhinoplasty</article-title>
          <source>Semin Plast Surg</source>
          <year>2009</year>
          <volume>23</volume>
          <issue>3</issue>
          <fpage>215</fpage>
          <lpage>222</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/20676316"/>
          </comment>
          <pub-id pub-id-type="doi">10.1055/s-0029-1224801</pub-id>
          <pub-id pub-id-type="medline">20676316</pub-id>
          <pub-id pub-id-type="pmcid">PMC2884923</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Buolamwini</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Gebru</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Gender shades: intersectional accuracy disparities in commercial gender classification</article-title>
          <source>Proc Mach Learn Res</source>
          <year>2018</year>
          <volume>81</volume>
          <fpage>77</fpage>
          <lpage>91</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://proceedings.mlr.press/v81/buolamwini18a.html"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kenig</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Monton Echeverria</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Muntaner Vives</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Human beauty according to artificial intelligence</article-title>
          <source>Plast Reconstr Surg Glob Open</source>
          <year>2023</year>
          <volume>11</volume>
          <issue>7</issue>
          <fpage>e5153</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37502224"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/GOX.0000000000005153</pub-id>
          <pub-id pub-id-type="medline">37502224</pub-id>
          <pub-id pub-id-type="pmcid">PMC10371313</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Keskinbora</surname>
              <given-names>KH</given-names>
            </name>
          </person-group>
          <article-title>Medical ethics considerations on artificial intelligence</article-title>
          <source>J Clin Neurosci</source>
          <year>2019</year>
          <volume>64</volume>
          <fpage>277</fpage>
          <lpage>282</lpage>
          <pub-id pub-id-type="doi">10.1016/j.jocn.2019.03.001</pub-id>
          <pub-id pub-id-type="medline">30878282</pub-id>
          <pub-id pub-id-type="pii">S0967-5868(19)30025-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Moridani</surname>
              <given-names>MK</given-names>
            </name>
            <name name-style="western">
              <surname>Jamiee</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Saghafi</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Human-like evaluation by facial attractiveness intelligent machine</article-title>
          <source>Int J Cogn Comput Eng</source>
          <year>2023</year>
          <volume>4</volume>
          <fpage>160</fpage>
          <lpage>169</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ijcce.2023.04.001</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eisenthal</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Dror</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Ruppin</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Facial attractiveness: beauty and the machine</article-title>
          <source>Neural Comput</source>
          <year>2006</year>
          <volume>18</volume>
          <issue>1</issue>
          <fpage>119</fpage>
          <lpage>142</lpage>
          <pub-id pub-id-type="doi">10.1162/089976606774841602</pub-id>
          <pub-id pub-id-type="medline">16354383</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kagian</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dror</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Leyvand</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Meilijson</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Cohen-Or</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ruppin</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>A machine learning predictor of facial attractiveness revealing human-like psychophysical biases</article-title>
          <source>Vision Res</source>
          <year>2008</year>
          <volume>48</volume>
          <issue>2</issue>
          <fpage>235</fpage>
          <lpage>243</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S0042-6989(07)00503-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.visres.2007.11.007</pub-id>
          <pub-id pub-id-type="medline">18164363</pub-id>
          <pub-id pub-id-type="pii">S0042-6989(07)00503-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gunes</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Piccardi</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Assessing facial beauty through proportion analysis by image processing and supervised learning</article-title>
          <source>Int J Hum Comput Stud</source>
          <year>2006</year>
          <volume>64</volume>
          <issue>12</issue>
          <fpage>1184</fpage>
          <lpage>1199</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ijhcs.2006.07.004</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>SCUT-FBP: a benchmark dataset for facial beauty perception</article-title>
          <year>2015</year>
          <conf-name>Proceedings of the IEEE International Conference on Systems, Man, and Cybernetics (SMC)</conf-name>
          <conf-date>October 9-12, 2015</conf-date>
          <conf-loc>Kowloon</conf-loc>
          <fpage>1821</fpage>
          <lpage>1826</lpage>
          <pub-id pub-id-type="doi">10.1109/smc.2015.319</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fan</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Samal</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>SZ</given-names>
            </name>
          </person-group>
          <article-title>Label distribution-based facial attractiveness computation by deep residual learning</article-title>
          <source>IEEE Trans Multimedia</source>
          <year>2018</year>
          <volume>20</volume>
          <issue>8</issue>
          <fpage>2196</fpage>
          <lpage>2208</lpage>
          <pub-id pub-id-type="doi">10.1109/tmm.2017.2780762</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Toward retrieval-grounded evaluation for conversational large language model-based risk assessment</article-title>
          <source>JMIR AI</source>
          <year>2026</year>
          <volume>5</volume>
          <fpage>e90759</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ai.jmir.org/2026/1/e90759/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/90759</pub-id>
          <pub-id pub-id-type="medline">41818631</pub-id>
          <pub-id pub-id-type="pii">v5i1e90759</pub-id>
          <pub-id pub-id-type="pmcid">PMC12981538</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ibrahem</surname>
              <given-names>AH</given-names>
            </name>
            <name name-style="western">
              <surname>Abdulazeez</surname>
              <given-names>AM</given-names>
            </name>
          </person-group>
          <article-title>A comprehensive review of facial beauty prediction using multi-task learning and facial attributes</article-title>
          <source>ARO</source>
          <year>2025</year>
          <volume>13</volume>
          <issue>1</issue>
          <fpage>10</fpage>
          <lpage>21</lpage>
          <pub-id pub-id-type="doi">10.14500/aro.11850</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Boukhari</surname>
              <given-names>DE</given-names>
            </name>
          </person-group>
          <article-title>VM-BeautyNet: a synergistic ensemble of vision transformer and Mamba for facial beauty prediction</article-title>
          <source>ArXiv. Preprint posted online on October 17, 2025</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2510.16220"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>SCUT-FBP5500: a diverse benchmark dataset for multi-paradigm facial beauty prediction</article-title>
          <year>2018</year>
          <conf-name>Proceedings of the 24th International Conference on Pattern Recognition (ICPR)</conf-name>
          <conf-date>August 20-24, 2018</conf-date>
          <conf-loc>Beijing</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icpr.2018.8546038</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Correll</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wittenbrink</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>The Chicago Face Database: a free stimulus set of faces and norming data</article-title>
          <source>Behav Res Methods</source>
          <year>2015</year>
          <volume>47</volume>
          <issue>4</issue>
          <fpage>1122</fpage>
          <lpage>1135</lpage>
          <pub-id pub-id-type="doi">10.3758/s13428-014-0532-5</pub-id>
          <pub-id pub-id-type="medline">25582810</pub-id>
          <pub-id pub-id-type="pii">10.3758/s13428-014-0532-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>GB</given-names>
            </name>
            <name name-style="western">
              <surname>Mattar</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Berg</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Labeled faces in the wild: a database for studying face recognition in unconstrained environments</article-title>
          <source>HAL Open Science</source>
          <year>2008</year>
          <access-date>2026-06-03</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://inria.hal.science/inria-00321923v1">https://inria.hal.science/inria-00321923v1</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lebedeva</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ying</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>MEBeauty: a multi-ethnic facial beauty dataset in-the-wild</article-title>
          <source>Neural Comput Appl</source>
          <year>2021</year>
          <volume>34</volume>
          <issue>17</issue>
          <fpage>14169</fpage>
          <lpage>14183</lpage>
          <pub-id pub-id-type="doi">10.1007/s00521-021-06535-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Obwegeser</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Timofte</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Mayer</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Bornstein</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Schätzle</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Patcas</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Scoring facial attractiveness with deep convolutional neural networks: how training on standardized images reduces the bias of facial expressions</article-title>
          <source>Orthod Craniofac Res</source>
          <year>2024</year>
          <volume>27</volume>
          <issue>Suppl 2</issue>
          <fpage>25</fpage>
          <lpage>32</lpage>
          <pub-id pub-id-type="doi">10.1111/ocr.12820</pub-id>
          <pub-id pub-id-type="medline">38825845</pub-id>
          <pub-id pub-id-type="pmcid">PMC11654357</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kärkkäinen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Joo</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>FairFace: face attribute dataset for balanced race, gender, and age for bias measurement and mitigation</article-title>
          <year>2021</year>
          <conf-name>2021 IEEE Winter Conference on Applications of Computer Vision (WACV)</conf-name>
          <conf-date>January 3-8, 2021</conf-date>
          <conf-loc>Waikoloa</conf-loc>
          <fpage>1548</fpage>
          <lpage>1558</lpage>
          <pub-id pub-id-type="doi">10.1109/wacv48630.2021.00159</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Merler</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ratha</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Feris</surname>
              <given-names>RS</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>JR</given-names>
            </name>
          </person-group>
          <article-title>Diversity in faces</article-title>
          <source>ArXiv. Preprint posted online on April 8, 2019</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1901.10436"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hardt</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mendler-Dünner</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Performative prediction: past and future</article-title>
          <source>Stat Sci</source>
          <year>2025</year>
          <volume>40</volume>
          <issue>3</issue>
          <fpage>417</fpage>
          <lpage>436</lpage>
          <pub-id pub-id-type="doi">10.1214/25-sts986</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Georgievskaya</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Tlyachev</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Danko</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Chekanov</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Corstjens</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>How artificial intelligence adopts human biases: the case of cosmetic skincare industry</article-title>
          <source>AI Ethics</source>
          <year>2023</year>
          <volume>5</volume>
          <issue>1</issue>
          <fpage>105</fpage>
          <lpage>115</lpage>
          <pub-id pub-id-type="doi">10.1007/s43681-023-00378-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Grother</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Ngan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hanaoka</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <source>Face Recognition Vendor Test (FRVT) Part 3: Demographic Effects</source>
          <year>2019</year>
          <publisher-loc>Gaithersburg</publisher-loc>
          <publisher-name>National Institute of Standards and Technology</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Georgopoulos</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Oldfield</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Nicolaou</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Panagakis</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Pantic</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Mitigating demographic bias in facial datasets with style-based multi-attribute transfer</article-title>
          <source>Int J Comput Vis</source>
          <year>2021</year>
          <volume>129</volume>
          <issue>7</issue>
          <fpage>2288</fpage>
          <lpage>2307</lpage>
          <pub-id pub-id-type="doi">10.1007/s11263-021-01448-w</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Maassarani</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Challita</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Zeaiter</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Chbib</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Chamy</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Farfour</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Ghanime</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Sleiman</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Ethnic rhinoplasty: a Middle East-centered patient satisfaction survey using the FACE-Q questionnaire</article-title>
          <source>Cureus</source>
          <year>2023</year>
          <volume>15</volume>
          <issue>6</issue>
          <fpage>e40048</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37425578"/>
          </comment>
          <pub-id pub-id-type="doi">10.7759/cureus.40048</pub-id>
          <pub-id pub-id-type="medline">37425578</pub-id>
          <pub-id pub-id-type="pmcid">PMC10324984</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cobo</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Rhinoplasty considerations in the ethnic patient using a case-based approach: the Latino patient</article-title>
          <source>Facial Plast Surg Clin North Am</source>
          <year>2022</year>
          <volume>30</volume>
          <issue>4</issue>
          <fpage>513</fpage>
          <lpage>520</lpage>
          <pub-id pub-id-type="doi">10.1016/j.fsc.2022.07.005</pub-id>
          <pub-id pub-id-type="medline">39492207</pub-id>
          <pub-id pub-id-type="pii">S1064-7406(22)00061-X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Soltan</surname>
              <given-names>AAS</given-names>
            </name>
            <name name-style="western">
              <surname>Eyre</surname>
              <given-names>DW</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Clifton</surname>
              <given-names>DA</given-names>
            </name>
          </person-group>
          <article-title>An adversarial training framework for mitigating algorithmic biases in clinical machine learning</article-title>
          <source>NPJ Digit Med</source>
          <year>2023</year>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>55</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-023-00805-y"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-023-00805-y</pub-id>
          <pub-id pub-id-type="medline">36991077</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-023-00805-y</pub-id>
          <pub-id pub-id-type="pmcid">PMC10050816</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Conti</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Clémençon</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Mitigating bias in facial recognition systems: centroid fairness loss optimization</article-title>
          <source>ArXiv. Preprint posted online on April 27, 2025</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2504.19370"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Deng</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Mitigate bias in face recognition using skewness-aware reinforcement learning</article-title>
          <source>ArXiv. Preprint posted online on November 25, 2019</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1911.10692"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Amigo</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Perea</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Marks</surname>
              <given-names>RJ</given-names>
            </name>
          </person-group>
          <article-title>Mitigating algorithmic bias on facial expression recognition</article-title>
          <source>ArXiv. Preprint posted online on December 23, 2023</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2312.15307"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lebedeva</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Ying</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Personalized facial beauty assessment: a meta-learning approach</article-title>
          <source>Vis Comput</source>
          <year>2022</year>
          <volume>39</volume>
          <issue>3</issue>
          <fpage>1095</fpage>
          <lpage>1107</lpage>
          <pub-id pub-id-type="doi">10.1007/s00371-021-02387-w</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Muthukumar</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Pedapati</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Ratha</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Understanding unequal gender classification accuracy from face images</article-title>
          <source>ArXiv. Preprint posted online on November 30, 2018</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1812.00099"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hamel</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Busch</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Analysis of bias in deep learning facial beauty regressors</article-title>
          <source>ArXiv. Preprint posted online on September 29, 2025</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2509.24138"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Donia</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shaw</surname>
              <given-names>JA</given-names>
            </name>
          </person-group>
          <article-title>Co-design and ethical artificial intelligence for health: an agenda for critical research and practice</article-title>
          <source>Big Data Soc</source>
          <year>2021</year>
          <volume>8</volume>
          <issue>2</issue>
          <fpage>20539517211065248</fpage>
          <pub-id pub-id-type="doi">10.1177/20539517211065248</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ning</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Teixayavong</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mertens</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ting</surname>
              <given-names>DSW</given-names>
            </name>
          </person-group>
          <article-title>A translational perspective towards clinical AI fairness</article-title>
          <source>NPJ Digit Med</source>
          <year>2023</year>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>172</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-023-00918-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-023-00918-4</pub-id>
          <pub-id pub-id-type="medline">37709945</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-023-00918-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC10502051</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shorten</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Khoshgoftaar</surname>
              <given-names>TM</given-names>
            </name>
          </person-group>
          <article-title>A survey on image data augmentation for deep learning</article-title>
          <source>J Big Data</source>
          <year>2019</year>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>60</fpage>
          <pub-id pub-id-type="doi">10.1186/s40537-019-0197-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Burke</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Nolan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hayward</surname>
              <given-names>WG</given-names>
            </name>
            <name name-style="western">
              <surname>Russell</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sulikowski</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Is there an own-race preference in attractiveness?</article-title>
          <source>Evol Psychol</source>
          <year>2013</year>
          <volume>11</volume>
          <issue>4</issue>
          <fpage>855</fpage>
          <lpage>872</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/10.1177/147470491301100410?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub%20%200pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/147470491301100410</pub-id>
          <pub-id pub-id-type="medline">23948346</pub-id>
          <pub-id pub-id-type="pii">epjournal-3019</pub-id>
          <pub-id pub-id-type="pmcid">PMC10481032</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Choraria</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ferwana</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Mani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Varshney</surname>
              <given-names>LR</given-names>
            </name>
          </person-group>
          <article-title>Learning optimal features via partial invariance</article-title>
          <source>Proceedings of the AAAI Conference on Artificial Intelligence</source>
          <year>2023</year>
          <volume>37</volume>
          <issue>6</issue>
          <fpage>7175</fpage>
          <lpage>7183</lpage>
          <pub-id pub-id-type="doi">10.1609/aaai.v37i6.25875</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Baltaji</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Salehi</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Varshney</surname>
              <given-names>LR</given-names>
            </name>
          </person-group>
          <article-title>Efficient model-agnostic multi-group equivariant networks</article-title>
          <source>Trans Mach Learn Res</source>
          <year>2024</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://experts.illinois.edu/en/publications/efficient-model-agnostic-multi-group-equivariant-networ/"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2310.09675</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Akerlof</surname>
              <given-names>GA</given-names>
            </name>
            <name name-style="western">
              <surname>Kranton</surname>
              <given-names>RE</given-names>
            </name>
          </person-group>
          <source>Identity Economics: How Our Identities Shape Our Work, Wages, and Well-Being</source>
          <year>2010</year>
          <publisher-loc>Princeton, NJ</publisher-loc>
          <publisher-name>Princeton University Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gipiškis</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Tsai</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Kurasova</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>Explainable AI (XAI) in image segmentation in medicine, industry, and beyond: a survey</article-title>
          <source>ICT Express</source>
          <year>2024</year>
          <volume>10</volume>
          <issue>6</issue>
          <fpage>1331</fpage>
          <lpage>1354</lpage>
          <pub-id pub-id-type="doi">10.1016/j.icte.2024.09.008</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hao</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>A simple machine learning model for the prediction of acute kidney injury following noncardiac surgery in geriatric patients: a prospective cohort study</article-title>
          <source>BMC Geriatr</source>
          <year>2024</year>
          <volume>24</volume>
          <issue>1</issue>
          <fpage>549</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcgeriatr.biomedcentral.com/articles/10.1186/s12877-024-05148-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12877-024-05148-1</pub-id>
          <pub-id pub-id-type="medline">38918723</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12877-024-05148-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC11197315</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="web">
          <article-title>"Software as a Medical Device": possible framework for risk categorization and corresponding considerations</article-title>
          <source>International Medical Device Regulators Forum</source>
          <year>2014</year>
          <access-date>2026-06-03</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://imdrf.org/documents/software-medical-device-possible-framework-risk-categorization-and-corresponding-considerations">http://imdrf.org/documents/software-medical-device-possible-framework-risk-categorization-and-corresponding-considerations</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>FitzGerald</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hurst</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Implicit bias in healthcare professionals: a systematic review</article-title>
          <source>BMC Med Ethics</source>
          <year>2017</year>
          <volume>18</volume>
          <issue>1</issue>
          <fpage>19</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedethics.biomedcentral.com/articles/10.1186/s12910-017-0179-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12910-017-0179-8</pub-id>
          <pub-id pub-id-type="medline">28249596</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12910-017-0179-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC5333436</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shiffrin</surname>
              <given-names>RM</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Wagenmakers</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>A survey of model evaluation approaches with a tutorial on hierarchical Bayesian methods</article-title>
          <source>Cogn Sci</source>
          <year>2008</year>
          <volume>32</volume>
          <issue>8</issue>
          <fpage>1248</fpage>
          <lpage>1284</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://onlinelibrary.wiley.com/doi/10.1080/03640210802414826"/>
          </comment>
          <pub-id pub-id-type="doi">10.1080/03640210802414826</pub-id>
          <pub-id pub-id-type="medline">21585453</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref58">
        <label>58</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Parida</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Anwar</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>MP</given-names>
            </name>
            <name name-style="western">
              <surname>Blom</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Einat</surname>
              <given-names>TT</given-names>
            </name>
            <name name-style="western">
              <surname>Tonetti</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>CAFES: chest X-ray analysis using federated self-supervised learning for pediatric COVID-19 detection</article-title>
          <source>Proc SPIE Int Soc Opt Eng</source>
          <year>2024</year>
          <volume>12927</volume>
          <fpage>129271I</fpage>
          <pub-id pub-id-type="doi">10.1117/12.3008757</pub-id>
          <pub-id pub-id-type="medline">38873338</pub-id>
          <pub-id pub-id-type="pii">129271I</pub-id>
          <pub-id pub-id-type="pmcid">PMC11167651</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref59">
        <label>59</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Roy</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chakraborty</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Varshney</surname>
              <given-names>LR</given-names>
            </name>
          </person-group>
          <article-title>Beyond pooling: matching for robust generalization under data heterogeneity</article-title>
          <source>ArXiv. Preprint posted online on February 6, 2026</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2602.07154"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2602.07154</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref60">
        <label>60</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reddy</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Generative AI in healthcare: an implementation science informed translational path on application, integration and governance</article-title>
          <source>Implement Sci</source>
          <year>2024</year>
          <volume>19</volume>
          <issue>1</issue>
          <fpage>27</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://implementationscience.biomedcentral.com/articles/10.1186/s13012-024-01357-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13012-024-01357-9</pub-id>
          <pub-id pub-id-type="medline">38491544</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13012-024-01357-9</pub-id>
          <pub-id pub-id-type="pmcid">PMC10941464</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
