<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="review-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e79187</article-id><article-id pub-id-type="doi">10.2196/79187</article-id><article-categories><subj-group subj-group-type="heading"><subject>Review</subject></subj-group></article-categories><title-group><article-title>Machine Learning Techniques Used for the Identification of Sociodemographic Factors Associated With Cancer: Systematic Literature Review</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Gonz&#x00E1;lez-Infante</surname><given-names>Liz</given-names></name><degrees>MHRM</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Marquez</surname><given-names>Gaston</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name 
name-style="western"><surname>Parra-Soto</surname><given-names>Solange</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Cardona-Valencia</surname><given-names>M&#x00F3;nica</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Taramasco</surname><given-names>Carla</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff6">6</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Facultad de Ciencias Empresariales, Universidad del B&#x00ED;o-B&#x00ED;o</institution><addr-line>Andr&#x00E9;s Bello 720</addr-line><addr-line>Chill&#x00E1;n</addr-line><country>Chile</country></aff><aff id="aff2"><institution>Centro para la Prevenci&#x00F3;n y el Control del C&#x00E1;ncer</institution><addr-line>Santiago</addr-line><country>Chile</country></aff><aff id="aff3"><institution>Departamento de Ciencias de la Computaci&#x00F3;n y Tecnolog&#x00ED;as de la Informaci&#x00F3;n, Facultad de Ciencias Empresariales, Universidad del B&#x00ED;o-B&#x00ED;o</institution><addr-line>Chill&#x00E1;n</addr-line><country>Chile</country></aff><aff id="aff4"><institution>Departamento de Nutrici&#x00F3;n y Salud P&#x00FA;blica, Facultad de Ciencias de la Salud y de los Alimentos, Universidad del B&#x00ED;o-B&#x00ED;o</institution><addr-line>Chill&#x00E1;n</addr-line><country>Chile</country></aff><aff id="aff5"><institution>Departamento Ciencias de la Rehabilitaci&#x00F3;n en Salud, Facultad de Ciencias de la Salud y de los Alimentos, Universidad del 
B&#x00ED;o-B&#x00ED;o</institution><addr-line>Chill&#x00E1;n</addr-line><country>Chile</country></aff><aff id="aff6"><institution>ITISB, Facultad de Ingenier&#x00ED;a, Universidad Andr&#x00E9;s Bello</institution><addr-line>Vi&#x00F1;a del Mar</addr-line><country>Chile</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Chow</surname><given-names>James C L</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Shaffi</surname><given-names>Shamnad Mohamed</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Liz Gonz&#x00E1;lez-Infante, MHRM, Facultad de Ciencias Empresariales, Universidad del B&#x00ED;o-B&#x00ED;o, Andr&#x00E9;s Bello 720, Chill&#x00E1;n, Chile, 56 422463324; <email>liz.gonzalez2301@alumnos.ubiobio.cl</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>all authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>28</day><month>1</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e79187</elocation-id><history><date date-type="received"><day>17</day><month>06</month><year>2025</year></date><date date-type="rev-recd"><day>18</day><month>10</month><year>2025</year></date><date date-type="accepted"><day>30</day><month>10</month><year>2025</year></date></history><copyright-statement>&#x00A9; Liz Gonz&#x00E1;lez-Infante, Gaston Marquez, Solange Parra, M&#x00F3;nica Cardona, Carla Taramasco. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 28.1.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e79187"/><abstract><sec><title>Background</title><p>Cancer remains one of the foremost global causes of mortality, with nearly 10 million deaths recorded by 2020. As incidence rates rise, there is a growing interest in leveraging machine learning (ML) to enhance prediction, diagnosis, and treatment strategies. Despite these advancements, insufficient attention has been directed toward the integration of sociodemographic variables, which are crucial determinants of health equity, into ML models in oncology.</p></sec><sec><title>Objective</title><p>This review aims to investigate how ML techniques have been used to identify patterns of predictive association between sociodemographic factors and cancer-related outcomes. 
Specifically, it seeks to map current research endeavors by detailing the types of algorithms used, the sociodemographic variables examined, and the validation methodologies used.</p></sec><sec sec-type="methods"><title>Methods</title><p>We conducted a systematic literature review in accordance with the PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) guidelines. Searches were executed across 6 databases, focusing on the primary studies using ML to investigate the association between sociodemographic characteristics and cancer-related outcomes. The search strategy was informed by the PICO (population, intervention, comparison, and outcome) framework, and a set of predefined inclusion criteria was used to screen the studies. The methodological quality of each included paper was assessed.</p></sec><sec sec-type="results"><title>Results</title><p>Out of the 328 records examined, 19 satisfied the inclusion criteria. The majority of studies used supervised ML techniques, with random forest and extreme gradient boosting being the most commonly used. Frequently analyzed variables include age, sex, education level, income, and geographic location. Cross-validation is the predominant method for evaluating model performance. Nevertheless, the integration of clinical and sociodemographic data is limited, and efforts toward external validation are infrequent.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>ML holds significant potential for discerning patterns associated with the social determinants of cancer. Nevertheless, research in this domain remains fragmented and inconsistent. Future investigations should prioritize the integration of contextual factors, enhance model transparency, and bolster external validation. 
These measures are crucial for the development of more equitable, generalizable, and actionable ML applications in cancer care.</p></sec></abstract><kwd-group><kwd>cancer</kwd><kwd>health disparities</kwd><kwd>machine learning</kwd><kwd>predictive models</kwd><kwd>social determinants of health</kwd><kwd>sociodemographic factors</kwd><kwd>systematic review</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The use of machine learning (ML) in oncology has advanced significantly over the past decade, offering new opportunities for early detection, survival prediction, and treatment personalization. Models based on techniques such as random forests (RFs), extreme gradient boosting (XGBoost), and deep neural networks have demonstrated remarkable performance across different types of cancer, fueling enthusiasm for what has been termed digital precision oncology [<xref ref-type="bibr" rid="ref1">1</xref>]. However, most of these applications rely almost exclusively on clinical and biomedical data, limiting their ability to capture the broader social and structural factors that shape health outcomes [<xref ref-type="bibr" rid="ref2">2</xref>]. This gap raises important concerns, as it may compromise both the external validity and the equity of ML models. In this review, we consistently use the term sociodemographic factors to refer to variables such as age, sex, educational attainment, income, ethnicity, rurality, and access to health care. These factors conceptually overlap with the broader category of social determinants of health (SDoH), but our focus is on those variables that are typically available in clinical and research datasets and are explicitly integrated into ML models. 
By doing so, we ensure clarity and terminological consistency throughout the paper.</p><p>Our review focuses on the most common sociodemographic variables in clinical and research datasets, such as age, sex, education, income, and others, reflecting the current landscape of published ML studies rather than a deliberate theoretical choice. We recognize that these indicators only capture part of the social gradient influencing cancer outcomes. Therefore, we highlight the importance of future research integrating contextual and multilevel determinants, such as neighborhood characteristics, health care infrastructure, environmental exposures, and political factors, to promote an equity-centered approach to ML applications in oncology.</p><p>In parallel, the rise of explainable artificial intelligence (AI) has highlighted the importance of transparency and interpretability in clinical settings. Tools such as Shapley Additive Explanations and local interpretable model-agnostic explanations allow health care professionals to better understand ML models by identifying which variables are most relevant in predictions and how they interact with both clinical and sociodemographic factors [<xref ref-type="bibr" rid="ref3">3</xref>]. These advances not only strengthen trust in ML-based systems but also enhance their potential for integration into clinical practice and public health policy [<xref ref-type="bibr" rid="ref4">4</xref>]. The convergence of explainable AI and SDoH emerges as a promising pathway toward developing fairer and more actionable models.</p><p>Nevertheless, our review of the literature reveals that although research and reviews on ML in oncology are rapidly expanding, most have concentrated on methodological, genomic, or clinical aspects without adequately addressing sociodemographic factors. 
This omission limits the ability of the scientific community to develop robust guidelines for implementing models across diverse contexts and health systems. Against this backdrop, this study aimed to identify, characterize, and synthesize primary research that applied ML methods to analyze sociodemographic factors associated with cancer. The objective was to address both methodological and conceptual gaps while contributing to the development of fairer and more transparent models that can inform data-driven public health strategies. We present the results of a systematic literature review (SLR) examining how ML techniques have been used to identify and interpret sociodemographic factors in cancer-related studies. Of the 328 papers screened, 19 (5.8%) met the inclusion criteria. Rather than being a limitation, this number reflects the emerging nature of the field and highlights the value of conducting an early review to consolidate initial progress, make methodological and equity-related gaps more visible, and guide future research toward a stronger integration of sociodemographic factors in ML models applied to oncology.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Research Questions</title><p>Based on the main objective, we defined the following research questions:</p><list list-type="order"><list-item><p>What ML techniques have been applied in studies that analyze sociodemographic data of patients with cancer to identify factors associated with the disease?</p></list-item><list-item><p>What sociodemographic factors have been consistently identified as relevant to the diagnosis, progression, or treatment of cancer?</p></list-item></list></sec><sec id="s2-2"><title>Identification</title><p>The SLR was conducted in accordance with the PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) guidelines (<xref ref-type="supplementary-material" rid="app4">Checklist 1</xref>), which provide a rigorous framework 
for ensuring transparency and reproducibility in evidence synthesis [<xref ref-type="bibr" rid="ref5">5</xref>]. To guide the construction of the search strategy, we also adopted the PICO (population, intervention, comparison, and outcome) model, as recommended by Petersen et al [<xref ref-type="bibr" rid="ref6">6</xref>]. This framework allowed us to clearly define the target population, specify the type of intervention (ie, application of ML techniques), and focus the outcome on the identification of relevant sociodemographic factors associated with cancer (<xref ref-type="table" rid="table1">Table 1</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Keywords used in the PICO (population, intervention, comparison, and outcome) structure.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Component</td><td align="left" valign="bottom">Description</td><td align="left" valign="bottom">Keywords</td></tr></thead><tbody><tr><td align="left" valign="top">Population</td><td align="left" valign="top">Studies analyzing data from patients with cancer that include sociodemographic variables. 
These may encompass age, sex, socioeconomic status, education, and residence, among others.</td><td align="left" valign="top">&#x201C;Sociodemographic factors,&#x201D; &#x201C;social determinants,&#x201D; &#x201C;sociodemographic characteristics,&#x201D; and &#x201C;socio-demographic variables&#x201D;</td></tr><tr><td align="left" valign="top">Intervention</td><td align="left" valign="top">Application of machine learning techniques to identify and analyze sociodemographic factors associated with cancer.</td><td align="left" valign="top">&#x201C;Machine learning&#x201D; and &#x201C;artificial intelligence&#x201D;</td></tr><tr><td align="left" valign="top">Comparison</td><td align="left" valign="top">No previous studies with similar scope and objectives were identified as suitable comparators. This review explores a novel approach.</td><td align="left" valign="top">Not applicable</td></tr><tr><td align="left" valign="top">Outcome</td><td align="left" valign="top">Identification of the most relevant sociodemographic variables associated with cancer outcomes, and assessment of the predictive performance of the applied machine learning models.</td><td align="left" valign="top">&#x201C;Cancer,&#x201D; &#x201C;oncology,&#x201D; variable importance, model accuracy, and AUC<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>AUC: area under the curve.</p></fn></table-wrap-foot></table-wrap><p>The search terms were combined using the Boolean operators AND and OR to ensure comprehensive retrieval of relevant literature. 
The final search string was as follows:</p><p>(&#x201C;sociodemographic factors&#x201D; OR &#x201C;socio-demographic factors&#x201D; OR &#x201C;sociodemographic characteristics&#x201D; OR &#x201C;socio-demographic characteristics&#x201D; OR &#x201C;social determinants&#x201D; OR &#x201C;sociodemographic variables&#x201D; OR &#x201C;socio-demographic variables&#x201D;) AND (&#x201C;machine learning&#x201D; OR &#x201C;artificial intelligence&#x201D;) AND (&#x201C;cancer&#x201D; OR &#x201C;oncology&#x201D;)</p></sec><sec id="s2-3"><title>Screening</title><p>We conducted a comprehensive literature search across 6 major databases: PubMed (n=76), ACM Digital Library (n=85), ScienceDirect (n=7), IEEE Xplore (n=1), Web of Science Core Collection (n=80), and Scopus (n=79). Searches covered the period from database inception to October 14, 2024. PubMed was selected as the primary source for biomedical and oncology research. ScienceDirect was included to capture papers published in Elsevier journals not indexed elsewhere. ACM Digital Library and IEEE Xplore were used to retrieve computer science and engineering studies, where ML methods are often first reported. Web of Science facilitated interdisciplinary retrieval and citation tracking, while Scopus provided broad multidisciplinary coverage.</p><p>All records were exported, merged, and deduplicated prior to screening. To maximize comprehensiveness and minimize selection bias, we also applied forward and backward citation chasing on included studies. Full electronic search strategies for each database are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-4"><title>Paper Selection</title><sec id="s2-4-1"><title>Eligibility Criteria</title><p>Primary studies were screened and selected based on predefined inclusion and exclusion criteria. 
The specific inclusion and exclusion criteria applied are summarized in <xref ref-type="other" rid="box1">Textbox 1</xref>.</p><boxed-text id="box1"><title>Inclusion and exclusion criteria.</title><p><bold>Inclusion criteria</bold></p><list list-type="bullet"><list-item><p>Type of study: primary studies presenting original data or analysis. Quantitative studies applying machine learning techniques to analyze sociodemographic factors related to cancer, including experimental, observational (cohort, case-control, and cross-sectional), or methodological designs.</p></list-item><list-item><p>Study area: application of machine learning in health, focused on the analysis of sociodemographic factors (eg, age, sex, ethnicity, socioeconomic status, and health care access) and their association with any type of cancer (eg, breast, lung, prostate, and gastrointestinal).</p></list-item><list-item><p>Machine learning techniques: use of supervised algorithms (eg, neural networks, decision trees, support vector machines, and logistic regression), unsupervised (eg, clustering), or semisupervised algorithms. 
Reporting of performance metrics such as accuracy, sensitivity, specificity, and receiver operating characteristic area under the curve.</p></list-item><list-item><p>Sociodemographic factors: explicit analysis of sociodemographic variables related to cancer risk, prevalence, or progression, including age, sex, ethnicity, income, education, occupation, geographic location, health care access, and other socioeconomic determinants.</p></list-item><list-item><p>Publication period: studies published from 2014 onward.</p></list-item><list-item><p>Language: publications in English or Spanish.</p></list-item><list-item><p>Accessibility: full-text access or access to essential data and results enabling methodological evaluation.</p></list-item></list><p><bold>Exclusion criteria</bold></p><list list-type="bullet"><list-item><p>Type of study: systematic reviews, narrative reviews, meta-analyses, or secondary studies.</p></list-item><list-item><p>Study area: studies not analyzing the association between sociodemographic factors and cancer. Studies focused on other diseases (eg, diabetes and cardiovascular diseases).</p></list-item><list-item><p>Machine learning techniques: studies relying solely on traditional statistical methods and not reporting model validation metrics.</p></list-item><list-item><p>Sociodemographic factors: studies applying machine learning without including sociodemographic variables (eg, focused only on genetic, molecular, or biological data).</p></list-item><list-item><p>Publication period: studies published before 2014.</p></list-item><list-item><p>Language: publications in other languages without available translation.</p></list-item><list-item><p>Accessibility: abstracts or conference proceedings without access to the full paper.</p></list-item></list></boxed-text></sec><sec id="s2-4-2"><title>Quality Assessment</title><p>The purpose of the quality assessment was to evaluate the relevance of each selected paper. 
Although quality assessment did not influence the selection of primary studies [<xref ref-type="bibr" rid="ref7">7</xref>], we included it primarily to reflect the validity of the selected studies. Based on the response to each research question, we scored each paper with 2, 1, or 0 points. We then selected those papers that exceeded the 50% threshold. The studies chosen through this assessment ensure that our conclusions, drawn from the extracted data, are supported by adequately resourced evidence (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s2-4-3"><title>Study Selection and Resolution of Discrepancies</title><p>Each paper was independently screened by 2 reviewers according to predefined inclusion and exclusion criteria. Any disagreements regarding eligibility were addressed during consensus meetings, where reviewers jointly discussed the rationale for inclusion or exclusion. When consensus could not be reached, a third author was consulted to make the final decision. This procedure ensured transparency, reproducibility, and rigor throughout the study selection process.</p></sec></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>The SLR was conducted in accordance with the PRISMA guidelines, which provide a rigorous framework for ensuring transparency and reproducibility in evidence synthesis (<xref ref-type="fig" rid="figure1">Figure 1</xref>). Following the PRISMA methodology, a total of 15 primary studies published in peer-reviewed journals were identified. An additional 4 papers were included through forward snowballing, yielding a final sample of 19 studies. Among these, 58% (11/19) were conducted in the United States. Iran contributed 21% (4/19), followed by India with 11% (2/19), and South Korea with 5% (1/19). 
One study (5%) represented a collaborative effort between institutions in China and the United States (<xref ref-type="table" rid="table2">Table 2</xref>). The publication dates of the included studies ranged from 2018 to 2024. No eligible primary studies were found in workshop proceedings or book chapters.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) flowchart of the selection of primary studies for the systematic literature review. N/A: not applicable.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e79187_fig01.png"/></fig><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Distribution of primary studies by country.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Country</td><td align="left" valign="bottom">Number of studies</td></tr></thead><tbody><tr><td align="left" valign="top">United States</td><td align="left" valign="top">11</td></tr><tr><td align="left" valign="top">Iran</td><td align="left" valign="top">4</td></tr><tr><td align="left" valign="top">India</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">South Korea</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">China-US collaboration</td><td align="left" valign="top">1</td></tr></tbody></table></table-wrap></sec><sec id="s3-2"><title>Machine Learning Algorithms and Validation Strategies Reported</title><p>Across the studies analyzed, consistent patterns emerged in both the selection of ML algorithms and the validation methods used (<xref ref-type="table" rid="table3">Table 3</xref>).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Summary of the machine learning algorithms and validation strategies reported across the 19 primary studies. 
Most studies applied ensemble methods such as random forest (RF) or gradient boosting, frequently combined with cross-validation schemes.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Study ID</td><td align="left" valign="bottom">Algorithms used</td><td align="left" valign="bottom">Validation strategy</td><td align="left" valign="bottom">Reference</td></tr></thead><tbody><tr><td align="left" valign="top">S1</td><td align="left" valign="top">Lasso<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> LR<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup>, RF, gradient boosting, DT<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup>, SVM<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td><td align="left" valign="top">5-fold CV<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup>, ROC-AUC<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup>, accuracy, sensitivity, specificity</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref8">8</xref>]</td></tr><tr><td align="left" valign="top">S2</td><td align="left" valign="top">XGBoost<sup><xref ref-type="table-fn" rid="table3fn7">g</xref></sup>, LightGBM<sup><xref ref-type="table-fn" rid="table3fn8">h</xref></sup>, CatBoost<sup><xref ref-type="table-fn" rid="table3fn9">i</xref></sup>, RF, AdaBoost, Lasso regression</td><td align="left" valign="top">10-fold CV</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref9">9</xref>]</td></tr><tr><td align="left" valign="top">S3</td><td align="left" valign="top">DT, RF</td><td align="left" valign="top">10-fold CV</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref10">10</xref>]</td></tr><tr><td align="left" valign="top">S4</td><td align="left" valign="top">RF, artificial neural networks, bootstrap aggregating CART<sup><xref ref-type="table-fn" rid="table3fn10">j</xref></sup>, XGBoost</td><td align="left" valign="top">10-fold CV</td><td 
align="left" valign="top">[<xref ref-type="bibr" rid="ref11">11</xref>]</td></tr><tr><td align="left" valign="top">S5</td><td align="left" valign="top">XGBoost</td><td align="left" valign="top">10-fold CV</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref12">12</xref>]</td></tr><tr><td align="left" valign="top">S6</td><td align="left" valign="top">LightGBM, XGBoost</td><td align="left" valign="top">10-fold CV</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref13">13</xref>]</td></tr><tr><td align="left" valign="top">S7</td><td align="left" valign="top">RF, Neural networks, LR, XGBoost</td><td align="left" valign="top">CV, AUC, grid search</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref14">14</xref>]</td></tr><tr><td align="left" valign="top">S8</td><td align="left" valign="top">RF, gradient boosting machine, SVM</td><td align="left" valign="top">5-fold CV, ROC</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref15">15</xref>]</td></tr><tr><td align="left" valign="top">S9</td><td align="left" valign="top">Radiomics-signature model</td><td align="left" valign="top">No formal validation performed</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref16">16</xref>]</td></tr><tr><td align="left" valign="top">S10</td><td align="left" valign="top">Multilayer perceptron, SVM, XGBoost</td><td align="left" valign="top">10-fold CV</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref17">17</xref>]</td></tr><tr><td align="left" valign="top">S11</td><td align="left" valign="top">Max-p-regions, RF, Jenks natural breaks</td><td align="left" valign="top">RF VIMP<sup><xref ref-type="table-fn" rid="table3fn11">k</xref></sup> ranking</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref18">18</xref>]</td></tr><tr><td align="left" valign="top">S12</td><td align="left" valign="top">CART, RF</td><td align="left" valign="top">Bootstrap sampling</td><td align="left" valign="top">[<xref 
ref-type="bibr" rid="ref19">19</xref>]</td></tr><tr><td align="left" valign="top">S13</td><td align="left" valign="top">DT, RF, Boruta feature selection</td><td align="left" valign="top">Confusion matrix</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref20">20</xref>]</td></tr><tr><td align="left" valign="top">S14</td><td align="left" valign="top">Bayesian additive regression trees</td><td align="left" valign="top">Partial dependence plots, variable inclusion proportion</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref21">21</xref>]</td></tr><tr><td align="left" valign="top">S15</td><td align="left" valign="top">LR, ridge classifier, SGD<sup><xref ref-type="table-fn" rid="table3fn12">l</xref></sup> classifier, KNN<sup><xref ref-type="table-fn" rid="table3fn13">m</xref></sup>, DT, linear support vector classifier, support vector classifier with radial basis function kernel, Gaussian Na&#x00EF;ve Bayes, AdaBoost classifier, RF, gradient boosting, QDA<sup><xref ref-type="table-fn" rid="table3fn14">n</xref></sup></td><td align="left" valign="top">5-fold CV, LOOCV<sup><xref ref-type="table-fn" rid="table3fn15">o</xref></sup></td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref22">22</xref>]</td></tr><tr><td align="left" valign="top">S16</td><td align="left" valign="top">Semiautomated segmentation + conditional LR</td><td align="left" valign="top">80/20 hold-out CV, ROC-AUC, Youden Index</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref23">23</xref>]</td></tr><tr><td align="left" valign="top">S17</td><td align="left" valign="top">Random survival forest, Cox proportional hazards</td><td align="left" valign="top">Grid search, C-index<sup><xref ref-type="table-fn" rid="table3fn16">p</xref></sup></td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref24">24</xref>]</td></tr><tr><td align="left" valign="top">S18</td><td align="left" valign="top">RF, SVM, gradient boosting machine</td><td align="left" 
valign="top">10-fold CV</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref25">25</xref>]</td></tr><tr><td align="left" valign="top">S19</td><td align="left" valign="top">SVM, DT, naive Bayesian model, and KNN</td><td align="left" valign="top">10-fold CV</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref26">26</xref>]</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Lasso: least absolute shrinkage and selection operator.</p></fn><fn id="table3fn2"><p><sup>b</sup>LR: logistic regression.</p></fn><fn id="table3fn3"><p><sup>c</sup>DT: decision tree.</p></fn><fn id="table3fn4"><p><sup>d</sup>SVM: support vector machine.</p></fn><fn id="table3fn5"><p><sup>e</sup>CV: cross-validation.</p></fn><fn id="table3fn6"><p><sup>f</sup>ROC-AUC: receiver operating characteristic area under the curve.</p></fn><fn id="table3fn7"><p><sup>g</sup>XGBoost: extreme gradient boosting.</p></fn><fn id="table3fn8"><p><sup>h</sup>LightGBM: light gradient boosting machine.</p></fn><fn id="table3fn9"><p><sup>i</sup>CatBoost: categorical boosting.</p></fn><fn id="table3fn10"><p><sup>j</sup>CART: classification and regression tree.</p></fn><fn id="table3fn11"><p><sup>k</sup>VIMP: variable importance.</p></fn><fn id="table3fn12"><p><sup>l</sup>SGD: stochastic gradient descent.</p></fn><fn id="table3fn13"><p><sup>m</sup>KNN: <italic>K</italic>-nearest neighbors.</p></fn><fn id="table3fn14"><p><sup>n</sup>QDA: quadratic discriminant analysis.</p></fn><fn id="table3fn15"><p><sup>o</sup>LOOCV: leave-one-out cross-validation.</p></fn><fn id="table3fn16"><p><sup>p</sup>C-index: concordance index.</p></fn></table-wrap-foot></table-wrap><p>This review identified a wide array of ML algorithms applied to the analysis of sociodemographic and clinical data related to cancer. Each method presents distinct advantages and limitations, influencing its suitability depending on the specific research context and analytical goals. 
The most relevant algorithmic approaches are summarized below.</p><p>Tree-based methods, particularly RF, were the most frequently used, appearing in 13 of the included studies. RF is widely valued for its interpretability, robustness, and ability to process both categorical and continuous variables, making it especially well-suited to heterogeneous datasets.</p><p>Boosting techniques, such as XGBoost and light gradient boosting machine (LightGBM), featured prominently in studies aiming for high predictive accuracy. XGBoost, used in 7 studies, is noted for its computational efficiency and its capacity to manage imbalanced data, while LightGBM is often selected in contexts where large-scale data processing is prioritized.</p><p>A smaller subset of studies used Bayesian additive regression trees, which were particularly useful in modeling uncertainty and capturing complex nonlinear associations. These features make Bayesian additive regression trees well-suited for analyzing disparities across ethnic and clinical subgroups.</p><p>Support vector machines (SVMs) appeared in 5 studies and are recognized for their ability to handle high-dimensional data and to separate complex classes using nonlinear decision boundaries [<xref ref-type="bibr" rid="ref27">27</xref>]. However, their performance is highly dependent on careful hyperparameter tuning, which can be challenging in the presence of large or noisy datasets [<xref ref-type="bibr" rid="ref27">27</xref>]. Overall, SVM models remain a valuable choice for complex biomedical data when appropriately optimized and validated within diverse clinical contexts.</p><p>Artificial neural networks (ANNs) were applied in select studies and demonstrated strong performance in modeling nonlinear relationships and uncovering hidden patterns in complex datasets [<xref ref-type="bibr" rid="ref28">28</xref>]. 
Despite their flexibility, the limited interpretability of ANNs often restricts their use in clinical contexts where transparency and explainability are required [<xref ref-type="bibr" rid="ref28">28</xref>]. Their use, therefore, should be accompanied by complementary interpretability frameworks to ensure clinical reliability and trustworthiness.</p><p>Regression-based models, including the least absolute shrinkage and selection operator and ridge regression, were commonly used as baseline models or for feature selection. These methods are appreciated for their simplicity and interpretability, although they may underperform in settings involving nonlinear relationships or intricate interactions between variables [<xref ref-type="bibr" rid="ref29">29</xref>]. Nevertheless, their transparency and ease of implementation make them a critical reference point for benchmarking more advanced ML models in oncology research.</p><p>Some studies also implemented bagged classification and regression tree models and ensemble methods such as stacking, reflecting a methodological interest in combining simplicity with predictive robustness. These strategies reduce model variance and enhance accuracy by integrating multiple base learners.</p><p>Overall, the analysis reveals a strong preference for tree-based algorithms, which offer an optimal balance between accuracy, interpretability, and adaptability to real-world clinical data. However, the choice of algorithm varied according to the nature of the dataset and the specific research objectives. 
More recent studies have increasingly adopted advanced methods such as boosting and neural networks, which provide enhanced predictive power but require greater expertise for interpretation and implementation.</p></sec><sec id="s3-3"><title>Common Validation Methods</title><p>The reviewed studies showed a strong preference for cross-validation (CV) as the primary strategy to evaluate ML models applied to the identification of sociodemographic factors related to cancer. This approach is widely recognized for its ability to reduce overfitting and enhance the robustness of predictive performance. Several configurations of CV were used across studies, with 10-fold CV being the most commonly used. This method appeared in studies such as Dianati-Nasab et al [<xref ref-type="bibr" rid="ref24">24</xref>], Stabellini et al [<xref ref-type="bibr" rid="ref20">20</xref>], and Afrash et al [<xref ref-type="bibr" rid="ref22">22</xref>], where it facilitated efficient partitioning of data into training and testing subsets, maximizing the use of available datasets.</p><p>In some cases, CV was complemented with repeated sampling to mitigate random variation and reinforce consistency. For instance, Wang et al [<xref ref-type="bibr" rid="ref30">30</xref>] implemented repetitions alongside 10-fold CV to strengthen model reliability. A less frequently used configuration, 5-fold CV, was applied in studies like Kaushik et al [<xref ref-type="bibr" rid="ref11">11</xref>], offering a computationally efficient alternative without substantially compromising model evaluation.</p><p>Several studies further enhanced reliability by incorporating multiple repetitions. 
A notable example is the work of He et al [<xref ref-type="bibr" rid="ref9">9</xref>], who used 200 repetitions and evaluated model performance using metrics such as the concordance index and variable importance measures to ensure consistency and interpretability.</p><p>The choice of evaluation metrics reflected a balanced interest in both model discrimination and interpretability. The area under the receiver operating characteristic curve was one of the most frequently reported metrics, particularly valued for its ability to quantify discrimination capacity. It was prominently featured in studies such as Dehdar et al [<xref ref-type="bibr" rid="ref19">19</xref>] and Niell et al [<xref ref-type="bibr" rid="ref12">12</xref>]. Additionally, accuracy, sensitivity, and specificity were widely reported, especially in studies such as Galadima et al [<xref ref-type="bibr" rid="ref25">25</xref>] and Lilhore et al [<xref ref-type="bibr" rid="ref14">14</xref>], as they provided a detailed picture of false positive and false negative rates.</p><p>Some researchers adopted tailored interpretability metrics to better understand model behavior. For example, Niu et al [<xref ref-type="bibr" rid="ref15">15</xref>] used variable inclusion proportions and partial dependence plots to explore the relative importance and marginal effect of predictors, offering deeper insights into model mechanisms. Model optimization also played a critical role in the validation process. Techniques such as grid search were frequently used to fine-tune hyperparameters, as observed in the work of Dehdar et al [<xref ref-type="bibr" rid="ref19">19</xref>]. 
In more specialized contexts, such as radiomics applications, validation using pretrained models was implemented, for example, in Dercle et al [<xref ref-type="bibr" rid="ref21">21</xref>], focusing on metastatic colorectal cancer and highlighting the relevance of domain-specific strategies.</p><p>While most studies ensured strong internal validity, a common limitation was the lack of external validation. Although a few studies used unseen datasets or pretrained models to assess generalizability, the overall scarcity of external validation in heterogeneous populations restricts the broader applicability of findings. This underscores the importance of expanding validation practices to include more diverse datasets and real-world scenarios.</p></sec><sec id="s3-4"><title>Analysis of Sociodemographic Variables</title><p>The reviewed studies demonstrate considerable variability in the types of sociodemographic variables incorporated into oncology research using ML techniques. Individual-level factors, such as age and sex (male, female, or intersex), were the most frequently included, underscoring their foundational role in the development and prognosis of various cancer types. For example, in breast cancer research, variables such as age at diagnosis and hormonal status appear consistently, as noted in the studies by Dianati-Nasab et al [<xref ref-type="bibr" rid="ref24">24</xref>] and Niell et al [<xref ref-type="bibr" rid="ref12">12</xref>]. Similarly, race and ethnicity were widely explored in studies addressing lung and colorectal cancer [<xref ref-type="bibr" rid="ref9">9</xref>], highlighting disparities in health outcomes associated with these variables.</p><p>In addition to individual characteristics, several studies incorporated socioeconomic and access-related factors, which reflect broader SDoH. 
Educational attainment and household income, often used as proxies for access to health resources and health-seeking behavior, featured prominently in studies on colorectal cancer [<xref ref-type="bibr" rid="ref13">13</xref>] and advanced-stage breast cancer [<xref ref-type="bibr" rid="ref13">13</xref>]. Other key access variables, such as transportation availability and type of health insurance, were also frequently considered to assess barriers to diagnosis and treatment, as shown in the works of Wang et al [<xref ref-type="bibr" rid="ref30">30</xref>] and Afrash et al [<xref ref-type="bibr" rid="ref22">22</xref>].</p><p>Some studies expanded their scope to include community- and environment-level variables, though these remain underrepresented overall. Galadima et al [<xref ref-type="bibr" rid="ref25">25</xref>], for instance, investigated aspects of the built environment, such as crime rates and housing values, and their association with late-stage colorectal cancer diagnoses. Similarly, Dehdar et al [<xref ref-type="bibr" rid="ref19">19</xref>] examined the influence of residence location, urban versus rural, on access to medical services, illustrating geographic disparities in health care delivery.</p><p>Regarding cancer types, breast cancer was the most frequently studied, followed by colorectal, lung, and gastric cancer. Research on breast cancer often focuses on the impact of delayed diagnosis and racial disparities, as seen in studies by Stabellini et al [<xref ref-type="bibr" rid="ref20">20</xref>]. In contrast, studies on colorectal cancer emphasized socioeconomic factors and health care access, particularly in relation to late-stage detection [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. 
Lung cancer studies primarily explored racial disparities and quality-of-life indicators in survival prediction [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>].</p><p>A few studies adopted a broader, multicancer approach, examining sociodemographic patterns across different tumor types. For example, Stabellini et al [<xref ref-type="bibr" rid="ref17">17</xref>] analyzed unplanned hospital readmissions in patients with solid tumors, integrating sociodemographic variables that have a direct influence on health outcomes. To provide a visual synthesis of these findings, <xref ref-type="fig" rid="figure2">Figure 2</xref> presents a summary linking the ML algorithms used with the most frequently analyzed sociodemographic variables.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Association between machine learning techniques and sociodemographic variables. ANN: artificial neural network; BT: Bayesian tree; DT: decision tree; LASSO: least absolute shrinkage and selection operator; LightGBM: light gradient boosting machine; LR: logistic regression; RF: random forest; SVM: support vector machine; XGBoost: extreme gradient boosting.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e79187_fig02.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Stratification of Findings</title><p>The reviewed studies confirm the potential of ML to identify patterns of predictive relevance of sociodemographic variables in relation to oncologic outcomes. However, the evidence remains fragmented and heterogeneous, with limited integration of contextual factors, reliance on predominantly internal validation, and little standardization in the reporting of performance and fairness. 
Overall, the findings suggest that ML can enhance risk stratification and the detection of disparities, but its real impact depends on methodological decisions that currently remain inconsistent.</p><p>In breast cancer, models most often prioritize age, race or ethnicity, and socioeconomic proxies to explain adverse events and late diagnosis. In colorectal cancer, income, insurance coverage, and geographic location are central for predicting advanced stage and survival. In lung cancer, studies more frequently explore ethnic disparities and quality-of-life measures associated with prognosis. This diversity suggests that the relevant set of SDoH is tumor-specific and linked to each care pathway.</p><p>Retrospective studies dominate; while they provide volume and feasibility, they limit causal inference and the ability to adapt to temporal social changes (eg, economic shocks, migration, or health system reforms). Prospective and longitudinal cohort designs would better capture the temporal variability of SDoH.</p><p>Greater interpretative weight should be placed on studies with stronger control of confounding, explicit handling of missing data, subgroup analyses, and when available, external validation. In contrast, studies with incomplete reporting of variables and opaque pipelines should be viewed as exploratory signals rather than evidence ready for implementation.</p></sec><sec id="s4-2"><title>Linking Inequities and ML Limitations</title><p>When sociodemographic factors are omitted or inconsistently defined, ML models often end up reflecting pre-existing inequities in access to and quality of care instead of uncovering or addressing them. This reflection of structural disparities undermines both the external validity and the generalizability of predictive models [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>]. 
Evidence from recent reviews indicates that algorithmic bias in health care typically emerges from unbalanced data representation and the absence of systematic fairness assessments, highlighting the importance of transparency and interpretability in model design [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]. Although variable-importance analyses can reveal which sociodemographic features most influence predictions, they fall short of explaining underlying causal mechanisms. As Prosperi et al [<xref ref-type="bibr" rid="ref35">35</xref>] and McCradden et al [<xref ref-type="bibr" rid="ref36">36</xref>] emphasize, achieving fairness and accountability in ML-driven health applications requires methodological and ethical frameworks that move beyond conventional supervised learning. For this reason, throughout this review, the term &#x201C;associated factors&#x201D; is used exclusively in a predictive, not causal, sense.</p><p>To advance the field, it is essential to standardize the reporting of sociodemographic variables, including age, sex (male, female, or intersex), race or ethnicity, education, income, rurality, and health insurance, as a minimum dataset to reduce heterogeneity and enable comparability across studies. Fairness metrics, such as demographic parity, equal opportunity, and subgroup calibration, should be applied alongside conventional measures like area under the curve and accuracy to explicitly assess model performance in vulnerable populations. Routine multicenter external validation is needed, testing models across diverse geographical and socioeconomic contexts. Incorporating neighborhood-level data (eg, area-level socioeconomic indices, transportation access, and housing conditions) can provide valuable context for individual predictors. 
Interdisciplinary collaboration between data scientists, oncologists, public health practitioners, and experts in social science and policy should be promoted to ensure that models achieve both technical precision and equity. Finally, transparent dissemination, including open-source code and model cards documenting limitations, is crucial to strengthen reproducibility and accountability.</p></sec><sec id="s4-3"><title>Principal Findings</title><p>This systematic review synthesized evidence from 19 primary studies published between 2018 and 2024 that applied ML techniques to analyze sociodemographic factors associated with cancer. The analysis revealed consistent methodological patterns, frequently used variables, and prevalent validation strategies, while also identifying key implications for both academic research and professional practice.</p><p>From a methodological perspective, there was a strong preference for tree-based algorithms, particularly RF, which was the most frequently used due to its capacity to manage heterogeneous datasets while preserving a degree of interpretability. Boosting methods, notably XGBoost and LightGBM, were also prominent, especially in studies aiming for high predictive accuracy in high-dimensional or imbalanced data contexts. Less frequently, SVMs and ANNs were used to capture complex, nonlinear relationships, typically in specialized modeling scenarios. Regression-based approaches such as the least absolute shrinkage and selection operator and ridge regression were primarily used for feature selection or as baseline models for comparative purposes.</p><p>Across the studies, a consistent set of core sociodemographic variables was identified. The most commonly included were age, sex (male, female, or intersex), educational level, income, ethnicity, and geographic location. These factors were primarily used to predict diagnostic timelines, disparities in access to treatment, and survival outcomes. 
However, only a limited number of studies incorporated broader structural or contextual variables&#x2014;such as neighborhood characteristics, transportation access, or housing conditions&#x2014;that could enrich model performance by capturing deeper dimensions of health inequity.</p><p>In terms of validation strategies, 10-fold CV was the most frequently implemented, followed by 5-fold validation in settings with limited computational resources. Most studies relied on standard evaluation metrics such as accuracy, area under the receiver operating characteristic curve, and sensitivity or specificity, reflecting a predominant focus on internal performance. However, the use of external validation with independent datasets was rare, limiting the generalizability of findings to broader, more diverse populations and real-world clinical environments.</p><p>From an applied perspective, the findings suggest that ML holds significant promise for identifying and quantifying structural health disparities in oncology. For the academic research community, this review highlights the importance of developing models that explicitly integrate SDoH, moving beyond individual-level data to encompass contextual and systemic influences. For clinicians and policymakers, predictive models incorporating sociodemographic factors offer a valuable complement to traditional clinical assessments, enabling the early identification of at-risk populations who might otherwise be overlooked.</p><p>Taken together, these findings underscore the transformative potential of ML when applied with methodological rigor, interpretability, and an explicit commitment to equity. 
Advancing this field will require not only continued technical innovation, but also interdisciplinary collaboration and a deliberate focus on addressing the social and structural dimensions of cancer prevention, diagnosis, and care.</p></sec><sec id="s4-4"><title>Limitations</title><p>We critically assessed potential threats to the validity of our SLR based on the Wohlin classification, which provides clear guidelines for identifying and mitigating such threats [<xref ref-type="bibr" rid="ref37">37</xref>].</p><p>Internal validity threats involve factors that could influence the reliability and accuracy of our study outcomes. A primary concern is selection bias, potentially stemming from limitations inherent in our search strategy and inclusion criteria. To minimize this risk, we carefully defined explicit and rigorous inclusion and exclusion criteria, conducting systematic searches across multiple reputable academic databases. Despite these measures, the relatively small final sample size (N=19) remains a limitation. To further reinforce internal validity, we conducted independent cross-checking and reviews with three domain experts, ensuring consistency and reliability in the selection and evaluation of studies.</p><p>External validity threats refer to the generalizability of our findings beyond the specific studies reviewed. A significant concern here is the representativeness of the primary studies regarding the broader application of ML to sociodemographic determinants of cancer. To mitigate this threat, we engaged external experts in data science and public health to provide critical insights and feedback on our findings, enhancing the relevance and applicability across different contexts [<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>Finally, construct validity threats pertain to the accurate interpretation and generalization of results in alignment with the study objectives. 
The primary concern here is potential subjectivity or bias in interpreting the findings. To address this, external collaborators participated in the analysis and classification phases, providing independent perspectives that strengthened the robustness and objectivity of our conclusions.</p></sec><sec id="s4-5"><title>Comparison With Prior Work</title><p>Several systematic reviews have examined the application of ML techniques in oncology, but their scope differs significantly from this study. Adeoye et al [<xref ref-type="bibr" rid="ref38">38</xref>] evaluated ML models in oncology settings with limited resources, identifying gaps in external validation and clinical adoption, but without providing a detailed analysis of sociodemographic variables. Hossain Raju et al [<xref ref-type="bibr" rid="ref26">26</xref>] reviewed the use of deep learning for breast cancer risk prediction, focusing mainly on imaging and genomic data. Kumar et al [<xref ref-type="bibr" rid="ref39">39</xref>] offered a broad overview of AI in oncology, emphasizing technical innovation rather than social determinants. Zeinali et al [<xref ref-type="bibr" rid="ref40">40</xref>] analyzed the application of ML in predicting cancer-related symptoms, again with a focus on clinical variables.</p><p>In addition, recent editorials and reviews have highlighted the need to move toward more interpretable and explainable models. For example, Hrinivich et al [<xref ref-type="bibr" rid="ref4">4</xref>] warned about the risks associated with the lack of interpretability in ML models in oncology, noting that reliance on opaque systems may amplify biases and weaken clinical trust. However, while these works underscore the importance of technical transparency, they do not systematically address the incorporation of sociodemographic factors into predictive cancer models.</p><p>Our review differs from previous contributions in three main ways. 
First, we provide a systematic synthesis of primary studies in which sociodemographic factors are explicitly integrated into ML models applied to oncological outcomes, thereby moving beyond an exclusively clinical or technical lens. Second, we critically assess methodological limitations&#x2014;such as the lack of external validation, limited interpretability, and absence of fairness metrics&#x2014;specifically in relation to the inclusion of sociodemographic data. Third, we connect these findings to broader discussions of equity and public health, emphasizing that neglecting social determinants may inadvertently reinforce inequalities in cancer care. By placing sociodemographic factors at the center rather than at the periphery, this review addresses an underexplored yet essential dimension of the field.</p><p>Ultimately, our findings contribute meaningfully to the growing body of literature by illustrating how ML can be leveraged to deepen our understanding of social inequalities in cancer outcomes. Rather than treating sociodemographic variables as peripheral, this study brings them to the forefront of analysis, offering a more nuanced view of how structural and contextual factors shape cancer risk, access to care, and treatment outcomes. These insights can help guide the development of more inclusive health policies and inform interventions that are responsive to the realities of diverse and historically underserved populations.</p></sec><sec id="s4-6"><title>Conclusions</title><p>This review indicates that the integration of sociodemographic factors into ML models for oncology is still an emerging field, with a modest evidence base that appears to be steadily growing. Only 19 primary studies met our inclusion criteria, yet their collective findings point to the potential benefits of embedding these variables within predictive frameworks. 
There is some evidence to suggest that explicitly accounting for sociodemographic factors could refine predictive accuracy and fairness, although these associations remain noncausal. That said, such conclusions remain tentative, as further research is needed to substantiate these observations. Looking ahead, researchers might prioritize enhancing the transparency of these models, exploring fairness metrics, and considering how such tools align with the broader goals of health policy. Advancing these aspects could prove vital in ensuring that ML supports both precision oncology and equitable public health outcomes. It is worth noting that, although the variables examined in this review are those most frequently reported in existing datasets, future research could benefit from incorporating contextual and structural determinants to strengthen both fairness and interpretability in ML-based cancer studies (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendices 2</xref> and <xref ref-type="supplementary-material" rid="app3">3</xref>).</p></sec></sec></body><back><ack><p>This study was conducted as part of the Doctoral Program in Economics and Information Management at the Universidad del B&#x00ED;o-B&#x00ED;o. The authors also acknowledge the support of the Center for Cancer Control and Prevention. Additionally, all the authors thank the &#x00D1;uble Health Hub of the Universidad del B&#x00ED;o-B&#x00ED;o for their valuable support. 
The authors did not use generative artificial intelligence technologies (such as ChatGPT or similar tools) in the generation of this manuscript.</p></ack><notes><sec><title>Funding</title><p>This research was supported by the Agencia Nacional de Investigaci&#x00F3;n y Desarrollo through the Fondo de Financiamiento de Centros de Investigaci&#x00F3;n en &#x00C1;reas Prioritarias (grant 152220002).</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">ANN</term><def><p>artificial neural network</p></def></def-item><def-item><term id="abb3">LightGBM</term><def><p>light gradient boosting machine</p></def></def-item><def-item><term id="abb4">ML</term><def><p>machine learning</p></def></def-item><def-item><term id="abb5">PICO</term><def><p>population, intervention, comparison, and outcome</p></def></def-item><def-item><term id="abb6">PRISMA</term><def><p>Preferred Reporting Items for Systematic Reviews and Meta-Analyses</p></def></def-item><def-item><term id="abb7">RF</term><def><p>random forest</p></def></def-item><def-item><term id="abb8">SDoH</term><def><p>social determinants of health</p></def></def-item><def-item><term id="abb9">SLR</term><def><p>systematic literature review</p></def></def-item><def-item><term id="abb10">SVM</term><def><p>support vector machine</p></def></def-item><def-item><term id="abb11">XGBoost</term><def><p>extreme gradient boosting</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Zhang</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Global burden of female breast cancer and its association with socioeconomic development status, 1990-2044</article-title><source>Cancer Rep (Hoboken)</source><year>2023</year><month>09</month><volume>6</volume><issue>Suppl 1</issue><fpage>e1827</fpage><pub-id pub-id-type="doi">10.1002/cnr2.1827</pub-id><pub-id pub-id-type="medline">37095062</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fountzilas</surname><given-names>E</given-names> </name><name name-style="western"><surname>Pearce</surname><given-names>T</given-names> </name><name name-style="western"><surname>Baysal</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Chakraborty</surname><given-names>A</given-names> </name><name name-style="western"><surname>Tsimberidou</surname><given-names>AM</given-names> </name></person-group><article-title>Convergence of evolving artificial intelligence and machine learning techniques in precision oncology</article-title><source>NPJ Digit Med</source><year>2025</year><month>01</month><day>31</day><volume>8</volume><issue>1</issue><fpage>75</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01471-y</pub-id><pub-id pub-id-type="medline">39890986</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alelyani</surname><given-names>T</given-names> </name><name name-style="western"><surname>Alshammari</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Almuhanna</surname><given-names>A</given-names> </name><name name-style="western"><surname>Asan</surname><given-names>O</given-names> </name></person-group><article-title>Explainable artificial intelligence in 
quantifying breast cancer factors: Saudi Arabia context</article-title><source>Healthcare (Basel)</source><year>2024</year><month>05</month><day>15</day><volume>12</volume><issue>10</issue><fpage>1025</fpage><pub-id pub-id-type="doi">10.3390/healthcare12101025</pub-id><pub-id pub-id-type="medline">38786433</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hrinivich</surname><given-names>WT</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>T</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>C</given-names> </name></person-group><article-title>Editorial: Interpretable and explainable machine learning models in oncology</article-title><source>Front Oncol</source><year>2023</year><volume>13</volume><fpage>1184428</fpage><pub-id pub-id-type="doi">10.3389/fonc.2023.1184428</pub-id><pub-id pub-id-type="medline">37035194</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moher</surname><given-names>D</given-names> </name><name name-style="western"><surname>Liberati</surname><given-names>A</given-names> </name><name name-style="western"><surname>Tetzlaff</surname><given-names>J</given-names> </name><name name-style="western"><surname>Altman</surname><given-names>DG</given-names> </name><collab>PRISMA Group</collab></person-group><article-title>Preferred Reporting Items for Systematic Reviews and Meta-Analyses: the PRISMA statement</article-title><source>PLoS Med</source><year>2009</year><month>07</month><day>21</day><volume>6</volume><issue>7</issue><fpage>e1000097</fpage><pub-id pub-id-type="doi">10.1371/journal.pmed.1000097</pub-id><pub-id pub-id-type="medline">19621072</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Petersen</surname><given-names>K</given-names> </name><name name-style="western"><surname>Vakkalanka</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kuzniarz</surname><given-names>L</given-names> </name></person-group><article-title>Guidelines for conducting systematic mapping studies in software engineering: an update</article-title><source>Inf Softw Technol</source><year>2015</year><month>08</month><volume>64</volume><fpage>1</fpage><lpage>18</lpage><pub-id pub-id-type="doi">10.1016/j.infsof.2015.03.007</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Kitchenham</surname><given-names>B</given-names> </name></person-group><article-title>Procedures for performing systematic reviews (technical report TR/SE-0401)</article-title><year>2004</year><access-date>2026-01-20</access-date><publisher-name>Keele University</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.inf.ufsc.br/~aldo.vw/kitchenham.pdf">https://www.inf.ufsc.br/~aldo.vw/kitchenham.pdf</ext-link></comment></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mortezagholi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Khosravizadeh</surname><given-names>O</given-names> </name><name name-style="western"><surname>Menhaj</surname><given-names>MB</given-names> </name><name name-style="western"><surname>Shafigh</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kalhor</surname><given-names>R</given-names> </name></person-group><article-title>Make intelligent of gastric cancer diagnosis error in Qazvin&#x2019;s medical centers: using data mining 
method</article-title><source>Asian Pac J Cancer Prev</source><year>2019</year><month>09</month><day>1</day><volume>20</volume><issue>9</issue><fpage>2607</fpage><lpage>2610</lpage><pub-id pub-id-type="doi">10.31557/APJCP.2019.20.9.2607</pub-id><pub-id pub-id-type="medline">31554353</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>He</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>JX</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>CT</given-names> </name><etal/></person-group><article-title>The relative importance of clinical and socio-demographic variables in prognostic prediction in non-small cell lung cancer: a variable importance approach</article-title><source>Med Care</source><year>2020</year><month>05</month><volume>58</volume><issue>5</issue><fpage>461</fpage><lpage>467</lpage><pub-id pub-id-type="doi">10.1097/MLR.0000000000001288</pub-id><pub-id pub-id-type="medline">31985586</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sim</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>YA</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>JH</given-names> </name><etal/></person-group><article-title>The major effects of health-related quality of life on 5-year survival prediction among lung cancer survivors: applications of machine learning</article-title><source>Sci Rep</source><year>2020</year><month>07</month><day>1</day><volume>10</volume><issue>1</issue><fpage>10693</fpage><pub-id pub-id-type="doi">10.1038/s41598-020-67604-3</pub-id><pub-id pub-id-type="medline">32612283</pub-id></nlm-citation></ref><ref 
id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kaushik</surname><given-names>M</given-names> </name><name name-style="western"><surname>Joshi</surname><given-names>RC</given-names> </name><name name-style="western"><surname>Kushwah</surname><given-names>AS</given-names> </name><etal/></person-group><article-title>Cytokine gene variants and socio-demographic characteristics as predictors of cervical cancer: a machine learning approach</article-title><source>Comput Biol Med</source><year>2021</year><month>07</month><volume>134</volume><fpage>104559</fpage><pub-id pub-id-type="doi">10.1016/j.compbiomed.2021.104559</pub-id><pub-id pub-id-type="medline">34147008</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Niell</surname><given-names>BL</given-names> </name><name name-style="western"><surname>Abdalah</surname><given-names>M</given-names> </name><name name-style="western"><surname>Stringfield</surname><given-names>O</given-names> </name><etal/></person-group><article-title>Quantitative measures of background parenchymal enhancement predict breast cancer risk</article-title><source>AJR Am J Roentgenol</source><year>2021</year><month>07</month><volume>217</volume><issue>1</issue><fpage>64</fpage><lpage>75</lpage><pub-id pub-id-type="doi">10.2214/AJR.20.23804</pub-id><pub-id pub-id-type="medline">32876474</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dong</surname><given-names>W</given-names> </name><name name-style="western"><surname>Bensken</surname><given-names>WP</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>U</given-names> </name><name 
name-style="western"><surname>Rose</surname><given-names>J</given-names> </name><name name-style="western"><surname>Berger</surname><given-names>NA</given-names> </name><name name-style="western"><surname>Koroukian</surname><given-names>SM</given-names> </name></person-group><article-title>Phenotype discovery and geographic disparities of late-stage breast cancer diagnosis across U.S. counties: a machine learning approach</article-title><source>Cancer Epidemiol Biomarkers Prev</source><year>2022</year><month>01</month><volume>31</volume><issue>1</issue><fpage>66</fpage><lpage>76</lpage><pub-id pub-id-type="doi">10.1158/1055-9965.EPI-21-0838</pub-id><pub-id pub-id-type="medline">34697059</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lilhore</surname><given-names>UK</given-names> </name><name name-style="western"><surname>Poongodi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kaur</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Hybrid model for detection of cervical cancer using causal analysis and machine learning techniques</article-title><source>Comput Math Methods Med</source><year>2022</year><volume>2022</volume><fpage>4688327</fpage><pub-id pub-id-type="doi">10.1155/2022/4688327</pub-id><pub-id pub-id-type="medline">35572826</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Niu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>B</given-names> </name></person-group><article-title>Correlates of cancer prevalence across 
census tracts in the United States: a Bayesian machine learning approach</article-title><source>Spat Spatiotemporal Epidemiol</source><year>2022</year><month>08</month><volume>42</volume><fpage>100522</fpage><pub-id pub-id-type="doi">10.1016/j.sste.2022.100522</pub-id><pub-id pub-id-type="medline">35934328</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stabellini</surname><given-names>N</given-names> </name><name name-style="western"><surname>Dmukauskas</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bittencourt</surname><given-names>MS</given-names> </name><etal/></person-group><article-title>Social determinants of health and racial disparities in cardiac events in breast cancer</article-title><source>J Natl Compr Canc Netw</source><year>2023</year><month>07</month><volume>21</volume><issue>7</issue><fpage>705</fpage><lpage>714</lpage><pub-id pub-id-type="doi">10.6004/jnccn.2023.7023</pub-id><pub-id pub-id-type="medline">37433439</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stabellini</surname><given-names>N</given-names> </name><name name-style="western"><surname>Nazha</surname><given-names>A</given-names> </name><name name-style="western"><surname>Agrawal</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Thirty-day unplanned hospital readmissions in patients with cancer and the impact of social determinants of health: a machine learning approach</article-title><source>JCO Clin Cancer Inform</source><year>2023</year><month>07</month><volume>7</volume><fpage>e2200143</fpage><pub-id pub-id-type="doi">10.1200/CCI.22.00143</pub-id><pub-id pub-id-type="medline">37463363</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stone</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kalahiki</surname><given-names>C</given-names> </name><name name-style="western"><surname>Li</surname><given-names>L</given-names> </name><name name-style="western"><surname>Hubig</surname><given-names>N</given-names> </name><name name-style="western"><surname>Iuricich</surname><given-names>F</given-names> </name><name name-style="western"><surname>Dunn</surname><given-names>H</given-names> </name></person-group><article-title>Evaluation of breast tumor morphologies from African American and Caucasian patients</article-title><source>Comput Struct Biotechnol J</source><year>2023</year><volume>21</volume><fpage>3459</fpage><lpage>3465</lpage><pub-id pub-id-type="doi">10.1016/j.csbj.2023.06.019</pub-id><pub-id pub-id-type="medline">38213888</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dehdar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Salimifard</surname><given-names>K</given-names> </name><name name-style="western"><surname>Mohammadi</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Applications of different machine learning approaches in prediction of breast cancer diagnosis delay</article-title><source>Front Oncol</source><year>2023</year><volume>13</volume><fpage>1103369</fpage><pub-id pub-id-type="doi">10.3389/fonc.2023.1103369</pub-id><pub-id pub-id-type="medline">36874113</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stabellini</surname><given-names>N</given-names> </name><name name-style="western"><surname>Cullen</surname><given-names>J</given-names> 
</name><name name-style="western"><surname>Moore</surname><given-names>JX</given-names> </name><etal/></person-group><article-title>Social determinants of health data improve the prediction of cardiac outcomes in females with breast cancer</article-title><source>Cancers (Basel)</source><year>2023</year><month>09</month><day>19</day><volume>15</volume><issue>18</issue><fpage>4630</fpage><pub-id pub-id-type="doi">10.3390/cancers15184630</pub-id><pub-id pub-id-type="medline">37760599</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dercle</surname><given-names>L</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>M</given-names> </name><name name-style="western"><surname>G&#x00F6;nen</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Ethnic diversity in treatment response for colorectal cancer: proof of concept for radiomics-driven enrichment trials</article-title><source>Eur Radiol</source><year>2023</year><month>12</month><volume>33</volume><issue>12</issue><fpage>9254</fpage><lpage>9261</lpage><pub-id pub-id-type="doi">10.1007/s00330-023-09862-z</pub-id><pub-id pub-id-type="medline">37368111</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Afrash</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Shafiee</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kazemi-Arpanahi</surname><given-names>H</given-names> </name></person-group><article-title>Establishing machine learning models to predict the early risk of gastric cancer based on lifestyle factors</article-title><source>BMC 
Gastroenterol</source><year>2023</year><month>01</month><day>10</day><volume>23</volume><issue>1</issue><fpage>6</fpage><pub-id pub-id-type="doi">10.1186/s12876-022-02626-x</pub-id><pub-id pub-id-type="medline">36627564</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dong</surname><given-names>W</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>U</given-names> </name><name name-style="western"><surname>Rose</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Geographic variation and risk factor association of early versus late onset colorectal cancer</article-title><source>Cancers (Basel)</source><year>2023</year><month>02</month><day>4</day><volume>15</volume><issue>4</issue><fpage>1006</fpage><pub-id pub-id-type="doi">10.3390/cancers15041006</pub-id><pub-id pub-id-type="medline">36831350</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dianati-Nasab</surname><given-names>M</given-names> </name><name name-style="western"><surname>Salimifard</surname><given-names>K</given-names> </name><name name-style="western"><surname>Mohammadi</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Machine learning algorithms to uncover risk factors of breast cancer: insights from a large case-control study</article-title><source>Front Oncol</source><year>2024</year><volume>13</volume><fpage>1276232</fpage><pub-id pub-id-type="doi">10.3389/fonc.2023.1276232</pub-id><pub-id pub-id-type="medline">38425674</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Galadima</surname><given-names>H</given-names> </name><name 
name-style="western"><surname>Anson-Dwamena</surname><given-names>R</given-names> </name><name name-style="western"><surname>Johnson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bello</surname><given-names>G</given-names> </name><name name-style="western"><surname>Adunlin</surname><given-names>G</given-names> </name><name name-style="western"><surname>Blando</surname><given-names>J</given-names> </name></person-group><article-title>Machine learning as a tool for early detection: a focus on late-stage colorectal cancer across socioeconomic spectrums</article-title><source>Cancers (Basel)</source><year>2024</year><month>01</month><day>26</day><volume>16</volume><issue>3</issue><fpage>540</fpage><pub-id pub-id-type="doi">10.3390/cancers16030540</pub-id><pub-id pub-id-type="medline">38339293</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Raju</surname><given-names>MAH</given-names> </name><name name-style="western"><surname>Imam</surname><given-names>T</given-names> </name><name name-style="western"><surname>Islam</surname><given-names>J</given-names> </name><name name-style="western"><surname>Al Rakin</surname><given-names>A</given-names> </name><name name-style="western"><surname>Nayyem</surname><given-names>MN</given-names> </name><name name-style="western"><surname>Uddin</surname><given-names>MS</given-names> </name></person-group><article-title>An ontological framework for lung carcinoma prognostication via sophisticated stacking and synthetic minority oversampling techniques</article-title><conf-name>2024 IEEE Asia Pacific Conference on Wireless and Mobile (APWiMob)</conf-name><conf-date>Nov 28-30, 2024</conf-date><conf-loc>Bali, Indonesia</conf-loc><fpage>125</fpage><lpage>130</lpage><pub-id pub-id-type="doi">10.1109/APWiMob64015.2024.10792946</pub-id></nlm-citation></ref><ref 
id="ref27"><label>27</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Hastie</surname><given-names>T</given-names> </name><name name-style="western"><surname>Tibshirani</surname><given-names>R</given-names> </name><name name-style="western"><surname>Friedman</surname><given-names>J</given-names> </name></person-group><source>The Elements of Statistical Learning: Data Mining, Inference, and Prediction</source><year>2009</year><access-date>2026-01-20</access-date><edition>2</edition><publisher-name>Springer</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://hastie.su.domains/ElemStatLearn/">https://hastie.su.domains/ElemStatLearn/</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Goodfellow</surname><given-names>I</given-names> </name><name name-style="western"><surname>Bengio</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Courville</surname><given-names>A</given-names> </name></person-group><source>Deep Learning</source><year>2016</year><access-date>2026-01-20</access-date><publisher-name>MIT Press</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.deeplearningbook.org/">https://www.deeplearningbook.org/</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zou</surname><given-names>H</given-names> </name><name name-style="western"><surname>Hastie</surname><given-names>T</given-names> </name></person-group><article-title>Regularization and variable selection via the elastic net</article-title><source>J R Stat Soc Series B Stat 
Methodol</source><year>2005</year><month>04</month><day>1</day><volume>67</volume><issue>2</issue><fpage>301</fpage><lpage>320</lpage><pub-id pub-id-type="doi">10.1111/j.1467-9868.2005.00503.x</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Barbosa</surname><given-names>EJM</given-names> </name></person-group><article-title>Demographics and socioeconomic determinants of health predict continued participation in a CT lung cancer screening program</article-title><source>Curr Probl Diagn Radiol</source><year>2024</year><volume>53</volume><issue>5</issue><fpage>552</fpage><lpage>559</lpage><pub-id pub-id-type="doi">10.1067/j.cpradiol.2024.04.004</pub-id><pub-id pub-id-type="medline">38658287</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rajkomar</surname><given-names>A</given-names> </name><name name-style="western"><surname>Hardt</surname><given-names>M</given-names> </name><name name-style="western"><surname>Howell</surname><given-names>MD</given-names> </name><name name-style="western"><surname>Corrado</surname><given-names>G</given-names> </name><name name-style="western"><surname>Chin</surname><given-names>MH</given-names> </name></person-group><article-title>Ensuring fairness in machine learning to advance health equity</article-title><source>Ann Intern Med</source><year>2018</year><month>12</month><day>18</day><volume>169</volume><issue>12</issue><fpage>866</fpage><lpage>872</lpage><pub-id pub-id-type="doi">10.7326/M18-1990</pub-id><pub-id pub-id-type="medline">30508424</pub-id></nlm-citation></ref><ref 
id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Colacci</surname><given-names>M</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>YQ</given-names> </name><name name-style="western"><surname>Postill</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Sociodemographic bias in clinical machine learning models: a scoping review of algorithmic bias instances and mechanisms</article-title><source>J Clin Epidemiol</source><year>2025</year><month>02</month><volume>178</volume><fpage>111606</fpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2024.111606</pub-id><pub-id pub-id-type="medline">39532254</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Parikh</surname><given-names>RB</given-names> </name><name name-style="western"><surname>Teeple</surname><given-names>S</given-names> </name><name name-style="western"><surname>Navathe</surname><given-names>AS</given-names> </name></person-group><article-title>Addressing bias in artificial intelligence in health care</article-title><source>JAMA</source><year>2019</year><month>12</month><day>24</day><volume>322</volume><issue>24</issue><fpage>2377</fpage><lpage>2378</lpage><pub-id pub-id-type="doi">10.1001/jama.2019.18058</pub-id><pub-id pub-id-type="medline">31755905</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ning</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ng</surname><given-names>YY</given-names> </name><etal/></person-group><article-title>Variable importance analysis with interpretable machine 
learning for fair risk prediction</article-title><source>PLOS Digit Health</source><year>2024</year><month>07</month><volume>3</volume><issue>7</issue><fpage>e0000542</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000542</pub-id><pub-id pub-id-type="medline">38995879</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Prosperi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sperrin</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Causal inference and counterfactual prediction in machine learning for actionable healthcare</article-title><source>Nat Mach Intell</source><year>2020</year><volume>2</volume><issue>7</issue><fpage>369</fpage><lpage>375</lpage><pub-id pub-id-type="doi">10.1038/s42256-020-0197-y</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McCradden</surname><given-names>MD</given-names> </name><name name-style="western"><surname>Joshi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mazwi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Anderson</surname><given-names>JA</given-names> </name></person-group><article-title>Ethical limitations of algorithmic fairness solutions in health care machine learning</article-title><source>Lancet Digit Health</source><year>2020</year><month>05</month><volume>2</volume><issue>5</issue><fpage>e221</fpage><lpage>e223</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(20)30065-0</pub-id><pub-id pub-id-type="medline">33328054</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="report"><person-group 
person-group-type="author"><name name-style="western"><surname>Kitchenham</surname><given-names>B</given-names> </name><name name-style="western"><surname>Charters</surname><given-names>S</given-names> </name></person-group><article-title>Guidelines for performing systematic literature reviews in software engineering</article-title><year>2007</year><access-date>2026-01-20</access-date><publisher-name>Keele University and University of Durham</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://legacyfileshare.elsevier.com/promis_misc/525444systematicreviewsguide.pdf">https://legacyfileshare.elsevier.com/promis_misc/525444systematicreviewsguide.pdf</ext-link></comment></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Adeoye</surname><given-names>J</given-names> </name><name name-style="western"><surname>Akinshipo</surname><given-names>A</given-names> </name><name name-style="western"><surname>Koohi-Moghadam</surname><given-names>M</given-names> </name><name name-style="western"><surname>Thomson</surname><given-names>P</given-names> </name><name name-style="western"><surname>Su</surname><given-names>YX</given-names> </name></person-group><article-title>Construction of machine learning-based models for cancer outcomes in low and lower-middle income countries: a scoping review</article-title><source>Front Oncol</source><year>2022</year><volume>12</volume><fpage>976168</fpage><pub-id pub-id-type="doi">10.3389/fonc.2022.976168</pub-id><pub-id pub-id-type="medline">36531037</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kumar</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Gupta</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Singla</surname><given-names>R</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>YC</given-names> </name></person-group><article-title>A systematic review of artificial intelligence techniques in cancer prediction and diagnosis</article-title><source>Arch Comput Methods Eng</source><year>2022</year><volume>29</volume><issue>4</issue><fpage>2043</fpage><lpage>2070</lpage><pub-id pub-id-type="doi">10.1007/s11831-021-09648-w</pub-id><pub-id pub-id-type="medline">34602811</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zeinali</surname><given-names>N</given-names> </name><name name-style="western"><surname>Youn</surname><given-names>N</given-names> </name><name name-style="western"><surname>Albashayreh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Fan</surname><given-names>W</given-names> </name><name name-style="western"><surname>Gilbertson White</surname><given-names>S</given-names> </name></person-group><article-title>Machine learning approaches to predict symptoms in people with cancer: systematic review</article-title><source>JMIR Cancer</source><year>2024</year><month>03</month><day>19</day><volume>10</volume><fpage>e52322</fpage><pub-id pub-id-type="doi">10.2196/52322</pub-id><pub-id pub-id-type="medline">38502171</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Quality assessment criteria and the assignment of scores.</p><media xlink:href="jmir_v28i1e79187_app1.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Primary studies by year and publication type.</p><media xlink:href="jmir_v28i1e79187_app2.png" xlink:title="PNG File, 323 
KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Primary studies description.</p><media xlink:href="jmir_v28i1e79187_app3.docx" xlink:title="DOCX File, 20 KB"/></supplementary-material><supplementary-material id="app4"><label>Checklist 1</label><p>PRISMA 2020 checklist.</p><media xlink:href="jmir_v28i1e79187_app4.pdf" xlink:title="PDF File, 133 KB"/></supplementary-material></app-group></back></article>