<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e89071</article-id><article-id pub-id-type="doi">10.2196/89071</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Detecting Uncoded Self-Harm in Veterans&#x2019; Electronic Health Records Using Positive and Unlabeled Learning: Retrospective Cohort Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Kumar</surname><given-names>Praveen</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Viszolay</surname><given-names>Alexandria D</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Upadhayaya</surname><given-names>Rajesh</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Moomtaheen</surname><given-names>Fariha</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Greer</surname><given-names>Donald R</given-names></name><degrees>BBA</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Bologa</surname><given-names>Cristian G</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Schneider</surname><given-names>Kristan A</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Davis</surname><given-names>Sharon E</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Matheny</surname><given-names>Michael E</given-names></name><degrees>MD, MS, MPH</degrees><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>van der Goes</surname><given-names>David</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Villarreal</surname><given-names>Gerardo</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhu</surname><given-names>Yiliang</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Tohen</surname><given-names>Mauricio</given-names></name><degrees>MBA, MD, DrPH</degrees><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Malec</surname><given-names>Scott A</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yang</surname><given-names>Jeremy J</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Fielstein</surname><given-names>Elliot M</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff8">8</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Lambert</surname><given-names>Christophe Gerard</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff3">3</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Internal Medicine, School of Medicine, University of New Mexico Health Sciences Center</institution><addr-line>1 University of New Mexico, MSC10 5550</addr-line><addr-line>Albuquerque</addr-line><addr-line>NM</addr-line><country>United States</country></aff><aff id="aff2"><institution>Greer Black Company</institution><addr-line>Bozeman</addr-line><addr-line>MT</addr-line><country>United States</country></aff><aff id="aff3"><institution>Raymond G. 
Murphy VA Medical Center</institution><addr-line>Albuquerque</addr-line><addr-line>NM</addr-line><country>United States</country></aff><aff id="aff4"><institution>Department of Biomedical Informatics, Vanderbilt University Medical Center</institution><addr-line>Nashville</addr-line><addr-line>TN</addr-line><country>United States</country></aff><aff id="aff5"><institution>VA Tennessee Valley Healthcare System</institution><addr-line>Nashville</addr-line><addr-line>TN</addr-line><country>United States</country></aff><aff id="aff6"><institution>Department of Economics, University of New Mexico</institution><addr-line>Albuquerque</addr-line><addr-line>NM</addr-line><country>United States</country></aff><aff id="aff7"><institution>Department of Psychiatry and Behavioral Sciences, School of Medicine, University of New Mexico Health Sciences Center</institution><addr-line>Albuquerque</addr-line><addr-line>NM</addr-line><country>United States</country></aff><aff id="aff8"><institution>Office of Mental Health, United States Department of Veterans Affairs</institution><addr-line>Washington</addr-line><addr-line>DC</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Brini</surname><given-names>Stefano</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Tai</surname><given-names>Andy</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Nelson</surname><given-names>Stuart</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Christophe Gerard Lambert, PhD, Department of Internal Medicine, School of Medicine, University of New Mexico Health Sciences Center, 1 University of New Mexico, MSC10 5550, Albuquerque, NM, 87131, United States, 1 505-272-9709; <email>cglambert@salud.unm.edu</email></corresp></author-notes><pub-date 
pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>4</day><month>6</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e89071</elocation-id><history><date date-type="received"><day>05</day><month>12</month><year>2025</year></date><date date-type="rev-recd"><day>17</day><month>04</month><year>2026</year></date><date date-type="accepted"><day>19</day><month>04</month><year>2026</year></date></history><copyright-statement>&#x00A9; Praveen Kumar, Alexandria D Viszolay, Rajesh Upadhayaya, Fariha Moomtaheen, Donald R Greer, Cristian G Bologa, Kristan A Schneider, Sharon E Davis, Michael E Matheny, David van der Goes, Gerardo Villarreal, Yiliang Zhu, Mauricio Tohen, Scott A Malec, Jeremy J Yang, Elliot M Fielstein, Christophe Gerard Lambert. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 4.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e89071"/><abstract><sec><title>Background</title><p>Underdiagnosis and undercoding are common across mental health conditions, particularly suicide and self-harm. This leaves health care datasets lacking reliable negative examples needed for predictive modeling, phenotype prevalence estimation, and identification of individuals at elevated risk. We use positive and unlabeled (PU) learning to address this challenge.</p></sec><sec><title>Objective</title><p>This study aims to identify US Veterans whose self-harm events were not explicitly captured through diagnostic codes in electronic health records (EHRs) and estimate the underlying prevalence using a novel PU learning algorithm.</p></sec><sec sec-type="methods"><title>Methods</title><p>We performed a retrospective cohort study using Veterans Health Administration EHRs (from October 1, 1999, to August 31, 2019), selecting a random 25% sample of 1,329,120 Veterans out of 5,316,480 (1,193,563 males and 135,557 females) with at least 2 years of observation. The study cohort comprised 24,625 Veterans with coded self-harm and 1,304,495 uncoded, with the mean ages of 38.39 (SD 12.17) and 48.76 (SD 15.04) years, respectively. We applied our PULSNAR (positive unlabeled learning selected not at random) algorithm to estimate the proportion of individuals with uncoded self-harm. Covariates included age, medical conditions, procedures, and clinical observations. Four experts (raters) independently reviewed charts of 97 uncoded Veterans, each selected from 1% intervals of calibrated PULSNAR probabilities from 0.01 to 0.97. 
Agreement was assessed among raters, PULSNAR classifications, and consensus review decisions. Post hoc calibration was used to refine prevalence estimates.</p></sec><sec sec-type="results"><title>Results</title><p>Of the 159,049 covariates in the dataset, PULSNAR&#x2019;s Extreme Gradient Boosting (XGBoost) model identified 1302 (0.82%) as informative for classification. Only 1.85% (24,625/1,329,120) of Veterans had diagnostic codes indicating self-harm events, while PULSNAR estimated an overall prevalence of 10.46% (139,026/1,329,120) by identifying an additional <italic>&#x03B1;</italic>=8.77% (114,404/1,304,495) of self-harm cases among the uncoded population. Of the 97 chart-reviewed patients, 39 had documented but uncoded self-harm. PULSNAR probabilities were post hoc calibrated such that their sum over the 97 cases equaled 39, which adjusted the combined coded and imputed prevalence downward from 10.46% to 7.91% (105,133/1,329,120). By applying this calibration to shift the probabilities of all uncoded Veterans, with bootstrapping for confidence intervals, PULSNAR estimates that coded self-harm represents only 23.4% (95% CI 17.76% to 31.51%) of all documented (coded+notes) self-harm.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Under the &#x201C;selected not at random&#x201D; assumption, PULSNAR provides an innovative and scalable framework for estimating the clinically documented prevalence of mental health conditions and identifying the uncoded individuals with calibrated prediction, without requiring confirmed negative labels. This method offers an alternative to time-consuming chart reviews for detecting likely cases missing structured coding capture. 
By addressing diagnostic undercoding of mental health conditions in EHRs, this approach has the potential to enhance the estimation of mental health prevalence and support screening, activation of automated clinical decision support, targeted intervention, better resource allocation, and research to improve outcomes in real-world settings.</p></sec></abstract><kwd-group><kwd>self-injurious behavior</kwd><kwd>machine learning</kwd><kwd>Veterans' health</kwd><kwd>PULSNAR</kwd><kwd>electronic health record</kwd><kwd>positive unlabeled learning selected not at random</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Suicide and self-harm remain significant public health concerns in the United States, consistently ranking among the leading causes of death. In 2023, suicide was among the top 8 causes of death for individuals aged 10&#x2010;64 years and the second leading cause for those aged 10-34 years [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. According to the Centers for Disease Control and Prevention, suicide accounted for 49,316 deaths in 2023 [<xref ref-type="bibr" rid="ref1">1</xref>], 1.6% of all deaths. The prevalence of suicidal thoughts and attempts is even higher than suicide deaths. In 2023, an estimated 12.8 million American adults aged 18 years or older experienced serious thoughts of suicide, 3.7 million made a suicide plan, and 1.5 million attempted suicide [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref3">3</xref>]. 
These data suggest that for each suicide death, there are hundreds of individuals experiencing suicidal ideation or engaging in self-harm.</p><p>Veterans are at a disproportionately higher risk of experiencing trauma and self-destructive behavior compared to the general population, accounting for nearly 14% of adult suicide deaths despite representing only 7.6% of the population [<xref ref-type="bibr" rid="ref4">4</xref>]. The overall unadjusted suicide rate among Veterans was roughly double that of non-Veteran adults (34.7 per 100,000 vs 17.1 per 100,000) [<xref ref-type="bibr" rid="ref5">5</xref>]. Both male (37.3 vs 28.7 per 100,000) and female Veterans (13.5 vs 7.2 per 100,000) had markedly higher suicide rates than their non-Veteran counterparts. In addition, younger Veterans (ages 18&#x2010;34 years) faced the highest risk (47.6 per 100,000)&#x2014;far exceeding that of any other age group [<xref ref-type="bibr" rid="ref5">5</xref>]. Co-occurring conditions, such as posttraumatic stress disorder, depression, bipolar disorder, substance use disorders, traumatic brain injury, and prior self-injurious behavior, significantly contribute to this elevated risk [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>Suicide rarely occurs in isolation; it is often preceded by identifiable risk factors, with trauma, past self-harm, and suicidal ideation being among the strongest predictors of future suicide risk [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. Early identification of individuals exhibiting these behaviors is critical, as timely intervention can significantly reduce the risk of suicide. Studying self-harm, therefore, is essential for comprehensive suicide prevention efforts. Within the Veterans Health Administration (VHA), policy and quality metrics primarily focus on suicidal self-directed violence within the past 12 months, rather than self-harm broadly defined. 
Although our study centers on ever self-harm as a phenotype, it is important to distinguish this analytic focus from operational VHA surveillance, which prioritizes identification and follow-up of suicidal behaviors in the near-term versus capturing all self-harm behaviors with and without intent.</p><p>The widespread adoption of electronic health records (EHRs) in the United States has generated large repositories of patient health care data, comprising both structured data (eg, demographics, diagnoses, procedures, and prescriptions) and unstructured data (eg, clinical notes, imaging, and pathology reports) [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref11">11</xref>]. While these data are invaluable for observational studies and health care analytics, they also reveal inconsistencies in data quality across different clinical settings and heterogeneous data structures that span structured codes, free-text notes, and temporal measurements [<xref ref-type="bibr" rid="ref11">11</xref>]. Additionally, these data are affected by missingness due to multiple mechanisms, including the incomplete capture of out-of-network care, underreporting of sensitive conditions, and not documenting or undercoding of clinical diagnoses. Undercoding refers to instances in which events or conditions, such as self-harm behaviors and mental illnesses, are not recorded in structured diagnosis codes in the EHR, leading to physicians overlooking past diagnoses, inaccurate prevalence estimates, and hindering effective intervention strategies. Importantly, undercoding of mental health conditions, including suicidality and self-harm, is common in EHRs, limiting research that relies on accurate reporting of these conditions [<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref15">15</xref>]. 
VHA&#x2019;s suicide risk surveillance relies heavily on policy-driven, standardized national note templates such as the Suicide Behavior and Overdose Report (SBOR), which capture suicidal and other self-directed violence events in structured clinical documentation. These templates function in parallel to diagnostic coding and are a primary mechanism by which VHA monitors suicide behaviors. Undercoding of <italic>ICD</italic> (<italic>International Classification of Diseases</italic>) self-harm diagnoses reflects partially the extent of VHA suicide risk monitoring efforts.</p><p>The increased availability of EHRs and advancements in machine learning (ML) methodologies have led to increased application of ML techniques to identify and predict instances of self-harm and suicidal ideation using EHRs and insurance claims data [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref22">22</xref>]. Kumar et al [<xref ref-type="bibr" rid="ref12">12</xref>] and Nestsiarovich et al [<xref ref-type="bibr" rid="ref13">13</xref>] used an Extreme Gradient Boosting (XGBoost) [<xref ref-type="bibr" rid="ref23">23</xref>] model on visit-level data to estimate the uncoded self-harm events among individuals with major mental illness. Simon et al [<xref ref-type="bibr" rid="ref16">16</xref>] developed random forest&#x2013;based models aimed at predicting fatal or nonfatal self-harm events within 90 days following a sampled encounter. Simon et al [<xref ref-type="bibr" rid="ref17">17</xref>] used LASSO (least absolute shrinkage and selection operator)&#x2013;based logistic regression models to predict suicide attempts and suicide deaths post-outpatient visits. 
Rozova et al [<xref ref-type="bibr" rid="ref18">18</xref>] used natural language processing (NLP) supervised learning techniques on free-text triage notes from emergency department (ED) visits to detect self-harm and suicidal ideation among ED patients. Walsh et al [<xref ref-type="bibr" rid="ref19">19</xref>] used random forest and nonregularized logistic regression models on longitudinal clinical data to detect the risk of nonfatal suicide attempts in adolescents. Tsui et al [<xref ref-type="bibr" rid="ref20">20</xref>] leveraged NLP models on clinical notes along with 4 ML techniques, including Na&#x00EF;ve Bayes, LASSO regression, random forest, and an ensemble of extreme gradient boosting, for the prediction of first-time suicide attempts. Su et al [<xref ref-type="bibr" rid="ref21">21</xref>] used LASSO logistic regression to predict suicidal behavior among children and adolescents based on their longitudinal clinical records, identifying both short- and long-term risk factors. Barak-Corren et al [<xref ref-type="bibr" rid="ref22">22</xref>] developed Bayesian models using a retrospective cohort approach to predict future documented suicidal behavior.</p><p>Previous studies used traditional positive-negative ML classifiers to identify/predict instances of self-harm in health records. However, due to undercoding common in mental health data, unlabeled instances contain both positive (diagnosed and undiagnosed) and negative (unaffected) cases, leading to biased classification and predictions. To address this, we applied a novel positive and unlabeled (PU) learning algorithm, PULSNAR (positive unlabeled learning selected not at random) [<xref ref-type="bibr" rid="ref24">24</xref>], to Veterans&#x2019; EHR data to estimate the proportion of Veterans with ever self-harm. Notably, we are not predicting future self-harm, but rather classifying whether patients had experienced self-harm at any point during the study period. 
PU learning is a semisupervised approach that uses labeled (coded) positive examples and unlabeled (uncoded) examples containing an unknown mixture of positives and negatives [<xref ref-type="bibr" rid="ref24">24</xref>].</p><p>To our knowledge, this is the first study to use PU learning algorithms to estimate the proportion of Veterans ever with self-harm imputing uncoded individuals in Veterans&#x2019; EHR data. By applying PULSNAR, we aim to improve the detection and estimation of self-harm prevalence among US Veterans, demonstrating a framework applicable to detecting other undetected mental health diagnoses. This approach supports earlier screening as well as novel intervention strategies to reduce self-harm and suicide rates in this high-risk population. Our findings are likely to improve awareness of risk factors of self-destructive behaviors among Veterans and highlight the broader use of PU learning in mental health informatics.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>PU Learning Background</title><p>In many real-world applications, annotating all records can be challenging, expensive, or even impossible due to the volume of data. Often, only positive instances are labeled, leaving a considerable portion of the data unlabeled [<xref ref-type="bibr" rid="ref25">25</xref>]. Notably, an unlabeled instance does not necessarily indicate a negative case because the absence of a diagnosis code does not confirm the absence of a condition. Given that only a fraction of records are labeled, learning from PU data has emerged as an active area of research [<xref ref-type="bibr" rid="ref24">24</xref>-<xref ref-type="bibr" rid="ref32">32</xref>]. 
The majority of current PU learning methods are based on the &#x201C;selected completely at random&#x201D; (SCAR) assumption [<xref ref-type="bibr" rid="ref25">25</xref>-<xref ref-type="bibr" rid="ref30">30</xref>], which posits that labeled positives are independent and identically distributed (i.i.d.) random samples from the positive distribution, meaning that the probability of an instance being labeled as positive is independent of its attributes [<xref ref-type="bibr" rid="ref26">26</xref>]. However, in real-world applications, this assumption often does not hold due to labeling bias; for example, patients with more severe or specific symptoms are more likely to be labeled with a disease than those with milder or unspecific symptoms.</p><p>Despite the prevalence of real-world non-SCAR PU data, only a few studies have focused on PU learning under the non-SCAR assumption [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>]. In this study, we have applied PULSNAR, a PU algorithm based on the SNAR (selected not at random) assumption. PULSNAR uses a divide-and-conquer approach to cluster SNAR positives into subtypes and estimate the proportion of each subtype among the unlabeled using the PULSCAR (positive unlabeled learning selected completely at random) algorithm. Under the SNAR assumption, the probability that a positive example is labeled depends on its attributes [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref32">32</xref>], which is more appropriate for health care data for which labeling bias is prevalent.</p></sec><sec id="s2-2"><title>PU Learning Algorithms to Estimate the Proportion of Self-Harm Among Uncoded Individuals</title><p>We found that existing state-of-the-art PU learning methods suffer from scalability issues and fail to execute on large datasets [<xref ref-type="bibr" rid="ref24">24</xref>]. 
As a result, in earlier work, we developed 2 novel PU learning algorithms aimed at estimating the proportion (<italic>&#x03B1;</italic>) of positive instances among unlabeled examples and subsequently imputing these instances: PULSCAR for when the SCAR assumption holds, and PULSNAR for when it does not. What sets our methods apart from other PU methods is that they not only estimate <italic>&#x03B1;</italic> but also calculate calibrated probabilities using the estimated <italic>&#x03B1;</italic>, leading to markedly improved classification performance. We focus our analysis on the application of the PULSNAR framework to universally available structured EHR data; extensive methodological comparisons demonstrating its superiority over standard supervised baselines have been detailed in our prior work [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref33">33</xref>].</p><p>EHR data may contain various types of self-harm events, such as hanging, poisoning, cutting, etc, and the likelihood of coding these different types of self-harm may differ due to the severity of the condition and the source and nature of the underlying trauma. Therefore, it becomes evident that the SCAR assumption may not hold for self-harm data, and SCAR-based PU methods may not be suitable for such data. Recognizing this limitation, we applied the PULSNAR algorithm to estimate the proportion of self-harm-uncoded patients in an EHR dataset of US Veterans.</p><p>When dealing with highly class-imbalanced training datasets, ML algorithms often exhibit a bias toward the majority class, struggling to effectively generalize patterns from the minority class, which yields biased predictions [<xref ref-type="bibr" rid="ref34">34</xref>]. 
Since only a tiny fraction of records were coded for self-harm in EHR data, we adopted a random undersampling approach [<xref ref-type="bibr" rid="ref35">35</xref>], where we transformed the imbalanced dataset into <italic>k</italic> balanced datasets (as illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>), where <italic>k=floor</italic>(|unlabeled|/|labeled|). Each balanced dataset comprised all labeled self-harm records along with a similar number of unlabeled records. Subsequently, we applied PULSCAR and PULSNAR algorithms (with XGBoost as the classifier) to each balanced dataset to estimate the proportion (<italic>&#x03B1;</italic>) of self-harm among the unlabeled records. The final <italic>&#x03B1;</italic> was determined by computing the mean of the <italic>k</italic> estimates of <italic>&#x03B1;</italic>. We performed hyperparameter tuning to determine optimal values for the XGBoost parameters. To better explore the high-dimensional covariate space, we used aggressive feature subsampling combined with a low learning rate and a large ensemble of deeper trees. This configuration limits early dominance by a small set of predictors and promotes broader exploration of the covariate space. With this sampling scheme, deeper trees provide more opportunities for additional variables to enter each tree and enable the model to capture complex interactions among covariates. Although deeper trees may increase the risk of overfitting, the large sample size and cross-validation procedures mitigate this risk. The final XGBoost parameter values were <italic>max_depth</italic>=12, <italic>n_estimators</italic>=400, <italic>learning_rate</italic>=0.05, <italic>min_child_weight</italic>=1, <italic>colsample_bytree</italic>=0.3, <italic>colsample_bylevel</italic>=0.6, <italic>n_jobs</italic>=32, <italic>random_state</italic>=0, and <italic>objective=&#x2018;binary:logistic&#x2019;</italic>. 
Other parameters were kept at their default values.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Steps for <italic>&#x03B1;</italic> estimation when the dataset has high class imbalance. The mix of blue and red represents the unlabeled set (U), and blue represents the positive set (P). To address class imbalance, <italic>k</italic> balanced datasets are generated, where <italic>k</italic>=<italic>floor</italic>(|unlabeled|/|labeled|). Each balanced dataset includes all labeled records and a randomly selected subset of unlabeled records of comparable size. The PU algorithm is applied to each of the <italic>k</italic> balanced datasets to estimate the proportion (<italic>&#x03B1;</italic>) of positives within the unlabeled set. The final <italic>&#x03B1;</italic> estimate is obtained by averaging the <italic>k</italic> individual <italic>&#x03B1;</italic> estimates. PULSCAR: positive unlabeled learning selected completely at random; PULSNAR: positive unlabeled learning selected not at random.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e89071_fig01.png"/></fig></sec><sec id="s2-3"><title>Data Source</title><p>For this study, we used the Veterans Health Administration (VHA) EHR data (from October 1, 1999, to August 31, 2019) available in the OMOP CDMv5 (Observational Medical Outcomes Partnership Common Data Model) [<xref ref-type="bibr" rid="ref36">36</xref>] format, selecting a random 25% sample of 1,329,120 Veterans (out of 5,316,480). The only inclusion criterion was that Veterans had at least 2 years of enrollment. 
Ethical approvals and data use agreements were obtained from the appropriate institutional review boards to ensure compliance with privacy and confidentiality regulations.</p></sec><sec id="s2-4"><title>Phenotyping and Covariate Selection</title><p>A self-harm phenotype was defined by the presence of one or more <italic>ICD-10-CM</italic> (<italic>International Classification of Diseases, 10th Revision, Clinical Modification</italic>) or <italic>ICD-9-CM</italic> codes (Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). These codes encompass all instances of intentional self-harm or suicide attempts by any means, including a history of self-harm. Patients with any of these codes were labeled as positive cases, while all others remained unlabeled. Accurate identification of self-harm events in structured data is complicated by self-directed violence nomenclature and coding rules that require documentation of clinical intent. Injury and poisoning codes must distinguish accidental events, intentional self-harm, assault, or undetermined causes, and in the absence of clear documentation, intent is often coded as accidental. These challenges are not unique to VHA; they affect coding practices in non-VHA systems and research cohorts as well.</p><p>Covariates included patient age at enrollment and the presence/absence of medical conditions, procedures, and clinical observations over the duration of patient observation. <italic>ICD-9-CM</italic> and <italic>ICD-10-CM</italic> diagnosis codes were mapped to their Systematized Nomenclature of Medicine (SNOMED) equivalents (and all ancestors thereof) using the OMOP vocabulary [<xref ref-type="bibr" rid="ref37">37</xref>]. 
Procedure codes from <italic>ICD-9-CM Volume 3</italic> (<italic>ICD-9-CM V3</italic>), the <italic>ICD-10 Procedure Coding System</italic> (<italic>ICD-10-PCS</italic>), and <italic>Current Procedural Terminology, Fourth Edition</italic>, were mapped to <italic>ICD-10-PCS</italic> concepts (and all ancestors thereof). Overall, from a dataset of 1,329,120 Veterans, we selected 159,049 covariates for use in the PU learning algorithm. Codes used to define the self-harm phenotype (Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) were excluded from the covariate list to prevent data leakage. Since each patient had an average of only 1203 nonzero features, a compressed sparse row (CSR) matrix with 1,329,120 rows (patients) and 159,049 columns (covariates) was created as input for the PU models. Covariates were encoded as binary values (0/1) in the CSR matrix; if a covariate was not present in an individual&#x2019;s data, it was set to 0, and if it was present, it was set to 1. Therefore, no covariate included in the modeling framework contained missing values. As a result, tests for missingness (eg, missing completely at random testing) and multiple imputation procedures were not applicable for our study. <xref ref-type="fig" rid="figure2">Figure 2</xref> illustrates the complete schema of our study.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Study schema. First, <italic>k</italic> balanced datasets were generated, and the PU learning method was applied to each of them. The parameter <italic>&#x03B1;<sub>j</sub></italic> was estimated for each balanced dataset, and the final <italic>&#x03B1;</italic> for the complete dataset was computed by taking the mean of the <italic>k</italic> estimated values <italic>&#x03B1;</italic><sub>1</sub><italic>,...,&#x03B1;<sub>k</sub></italic>. 
Using the estimated <italic>&#x03B1;</italic>, the PU method calculated the calibrated probability of being labeled as positive for each uncoded individual. These calibrated classifications were then used to determine probable positive (PP) and probable negative (PN) individuals among the uncoded individuals. CP: coded positive; OMOP: Observational Medical Outcomes Partnership; PULSNAR: positive unlabeled learning selected not at random; VHA: Veterans Health Administration.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e89071_fig02.png"/></fig></sec><sec id="s2-5"><title>Chart Review Process</title><p>To validate our model&#x2019;s classifications, we conducted a chart review of a random sample of unlabeled individuals whose calibrated probabilities fell into each 1% probability bin. Due to the absence of unlabeled individuals in the calibrated probability bins in the first and last two percentiles, the selection was limited to 97 individuals. To ensure rigorous clinical validity, a comprehensive set of search keywords was established a priori in direct collaboration with our clinical psychiatrist coauthor (GV). Four informaticists (coauthors CGB, DvdG, JJY, and CGL) used these clinically vetted keywords and search utilities to identify and extract potential evidence of suicidal or nonsuicidal self-harm from combined charts, which often exceeded 500,000 lines per patient. Interrater reliability was assessed at this stage to ensure consistency among the informatics reviewers. Crucially, the extracted clinical evidence for each of the 97 cases was subsequently evaluated and discussed in detail with the psychiatrist. 
Rather than limiting clinical oversight to disagreement resolution among informaticists, this joint case-by-case review ensured that definitional and semantic complexities (eg, distinguishing when chronic substance abuse strictly constitutes intentional self-harm) were uniformly adjudicated by a mental health clinician. Through this expert-guided process, individuals were manually classified as positive (class 1) or negative (class 0) for ever self-harm. Finally, we compared the sum of the PULSNAR-calibrated probabilities of the 97 individuals with the sum of their consensus labels identified through expert chart review.</p></sec><sec id="s2-6"><title>Calibrated Self-Harm Prevalence Estimates for Veterans</title><p>To calibrate the PULSNAR-estimated fraction of patients with self-harm against consensus chart review results, we applied a bias-only logit shift&#x2014;a logistic transformation that adds a uniform constant offset (<italic>c</italic>) to the logits of the original probabilities without altering their ordering. 
Mathematically, for each probability <italic>p<sub>i</sub></italic>, the logit is computed as <inline-formula><mml:math id="ieqn1"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:msub><mml:mi>l</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mfrac><mml:msub><mml:mi>p</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:mfrac><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula>; <italic>c</italic> is solved numerically (via root-finding) such that <inline-formula><mml:math id="ieqn2"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mi>U</mml:mi><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow></mml:mrow></mml:munderover><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mn>1</mml:mn><mml:mo>+</mml:mo><mml:mi>exp</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>l</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:mi>c</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula> is equal to the number of positives confirmed by chart review. 
The adjusted probabilities are <inline-formula><mml:math id="ieqn3"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:msub><mml:mrow><mml:mover><mml:mi>p</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mn>1</mml:mn><mml:mo>+</mml:mo><mml:mi>exp</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>l</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:mrow><mml:mover><mml:mi>c</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula>. This method preserves relative ordering and can shift probabilities lower when correcting overestimation, providing exact sum alignment to the gold standard. We quantified calibration uncertainty using a 100,000-sample bootstrap, sampling chart-reviewed observations with replacement, solving for <italic>c</italic> each time to produce an empirical distribution. The 2.5th and 97.5th percentiles formed the 95% CI for <italic><inline-formula><mml:math id="ieqn4"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mrow><mml:mover><mml:mi>c</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula></italic>, which we then applied to the unlabeled population to bound the total estimated positives.</p></sec><sec id="s2-7"><title>PULSNAR Classification vs Expert Chart Review</title><p>The probability of a self-harm diagnosis was stratified into low (probability &#x003C; 25%), intermediate (25% &#x2264; probability &#x2264; 75%), or high (probability &#x003E; 75%) categories. 
These stratified probability categories (which also serve as a proxy measure for the underlying behavioral risk or detection and identification of such) were compared with the outcome of the chart reviews stratified into unanimous decision against self-harm (ie, all reviewers agreed that no evidence of self-harm was found in the patient chart), dissenting evidence for self-harm (ie, some but not all reviewers agreed on self-harm), and unanimous decision for self-harm (ie, all reviewers agreed on self-harm).</p></sec><sec id="s2-8"><title>Ethical Considerations</title><p>Ethical approval for this study was obtained from the institutional review board and research and development committees of the VA New Mexico Healthcare System (H2947), University of New Mexico Health Sciences Center (20-477), and VA Tennessee Valley Healthcare System (#1576576). The requirement for informed consent was waived because the study involved secondary analysis of existing electronic health record data and posed minimal risk to participants. All data were deidentified prior to analysis, and access to the data was restricted to authorized study personnel and maintained on secure, password-protected VA servers to ensure privacy and confidentiality. No participants were contacted for this study, and no compensation was provided. No identifiable individual-level information or images are included in this manuscript or supplementary materials.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Patient Characteristics</title><p>Applying our inclusion and exclusion criteria, we identified 1,329,120 individuals (1,193,563 males and 135,557 females) for the study. 
<xref ref-type="table" rid="table1">Table 1</xref> summarizes the key demographic and clinical characteristics of the study population.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Demographic and clinical characteristics of patients with and without coded self-harm selected from Veterans Health Administration electronic health record data (October 1, 1999, to August 31, 2019). The listed comorbidities represent key covariates identified as important features by the XGBoost<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> model within the PULSNAR<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> algorithm.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Patient characteristics (n=1,329,120)</td><td align="left" valign="top" colspan="2">Coded for self-harm (n=24,625, 1.85%)</td><td align="left" valign="top" colspan="2">Uncoded for self-harm (n=1,304,495, 98.15%)</td><td align="left" valign="top">OR<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (95% CI)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Patients, n (%)</td><td align="char" char="." valign="top">95% CI</td><td align="left" valign="top">Patients, n (%)</td><td align="char" char="." 
valign="top">95% CI</td><td align="left" valign="top"/></tr></thead><tbody><tr><td align="left" valign="top">Male</td><td align="left" valign="top">20,723 (84.15)</td><td align="left" valign="top">83.69&#x2010;84.61</td><td align="left" valign="top">1,172,840 (89.91)</td><td align="left" valign="top">89.86&#x2010;89.96</td><td align="left" valign="top">0.60 (0.58&#x2010;0.62)</td></tr><tr><td align="left" valign="top">Female</td><td align="left" valign="top">3902 (15.85)</td><td align="left" valign="top">15.39&#x2010;16.31</td><td align="left" valign="top">131,655 (10.09)</td><td align="left" valign="top">10.04&#x2010;10.14</td><td align="left" valign="top">1.68 (1.62&#x2010;1.74)</td></tr><tr><td align="left" valign="top">Age (years), mean (SD)</td><td align="left" valign="top">38.39 (12.17)</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup></td><td align="left" valign="top">48.76 (15.04)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Age (years)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>0&#x2010;19</td><td align="left" valign="top">1301 (5.28)</td><td align="left" valign="top">5.01&#x2010;5.57</td><td align="left" valign="top">32,128 (2.46)</td><td align="left" valign="top">2.44&#x2010;2.49</td><td align="left" valign="top">2.21 (2.09&#x2010;2.34)</td></tr><tr><td align="char" char="." 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>20&#x2010;29</td><td align="left" valign="top">6126 (24.88)</td><td align="left" valign="top">24.34&#x2010;25.42</td><td align="left" valign="top">168,060 (12.88)</td><td align="left" valign="top">12.83&#x2010;12.94</td><td align="left" valign="top">2.24 (2.17&#x2010;2.31)</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>30&#x2010;39</td><td align="left" valign="top">5424 (22.03)</td><td align="left" valign="top">21.51&#x2010;22.55</td><td align="left" valign="top">164,091 (12.58)</td><td align="left" valign="top">12.52&#x2010;12.64</td><td align="left" valign="top">1.96 (1.90&#x2010;2.02)</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>40&#x2010;49</td><td align="left" valign="top">7261 (29.49)</td><td align="left" valign="top">28.92&#x2010;30.06</td><td align="left" valign="top">257,077 (19.71)</td><td align="left" valign="top">19.64&#x2010;19.78</td><td align="left" valign="top">1.70 (1.66&#x2010;1.75)</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>50&#x2010;59</td><td align="left" valign="top">3712 (15.07)</td><td align="left" valign="top">14.63&#x2010;15.53</td><td align="left" valign="top">365,610 (28.03)</td><td align="left" valign="top">27.95&#x2010;28.10</td><td align="left" valign="top">0.46 (0.44&#x2010;0.47)</td></tr><tr><td align="char" char="." 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>&#x2265;60</td><td align="left" valign="top">801 (3.25)</td><td align="left" valign="top">3.03&#x2010;3.48</td><td align="left" valign="top">317,529 (24.34)</td><td align="left" valign="top">24.27&#x2010;24.41</td><td align="left" valign="top">0.10 (0.10&#x2010;0.11)</td></tr><tr><td align="left" valign="top">Comorbidities</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Personality disorder</td><td align="left" valign="top">2536 (10.30)</td><td align="left" valign="top">9.92&#x2010;10.68</td><td align="left" valign="top">5627 (0.43)</td><td align="left" valign="top">0.42&#x2010;0.44</td><td align="left" valign="top">26.48 (25.22&#x2010;27.82)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Bipolar disorder</td><td align="left" valign="top">2374 (9.64)</td><td align="left" valign="top">9.27&#x2010;10.02</td><td align="left" valign="top">5887 (0.45)</td><td align="left" valign="top">0.44&#x2010;0.46</td><td align="left" valign="top">23.53 (22.38&#x2010;24.72)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Schizophrenia</td><td align="left" valign="top">944 (3.83)</td><td align="left" valign="top">3.60&#x2010;4.08</td><td align="left" valign="top">2277 (0.17)</td><td align="left" valign="top">0.17&#x2010;0.18</td><td align="left" valign="top">22.79 (21.09&#x2010;24.64)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Major depressive disorder</td><td align="left" valign="top">4412 (17.92)</td><td 
align="left" valign="top">17.44&#x2010;18.40</td><td align="left" valign="top">13,572 (1.04)</td><td align="left" valign="top">1.02&#x2010;1.06</td><td align="left" valign="top">20.76 (20.00&#x2010;21.53)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Psychoactive substance use disorder</td><td align="left" valign="top">3783 (15.36)</td><td align="left" valign="top">14.91&#x2010;15.82</td><td align="left" valign="top">10,933 (0.84)</td><td align="left" valign="top">0.82&#x2010;0.85</td><td align="left" valign="top">21.47 (20.63&#x2010;22.34)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Posttraumatic stress disorder</td><td align="left" valign="top">3563 (14.47)</td><td align="left" valign="top">14.03&#x2010;14.91</td><td align="left" valign="top">10,369 (0.79)</td><td align="left" valign="top">0.78&#x2010;0.81</td><td align="left" valign="top">21.10 (20.27&#x2010;22.00)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attention-deficit/hyperactivity disorder</td><td align="left" valign="top">509 (2.07)</td><td align="left" valign="top">1.89&#x2010;2.25</td><td align="left" valign="top">1266 (0.10)</td><td align="left" valign="top">0.09&#x2010;0.10</td><td align="left" valign="top">21.72 (19.56&#x2010;24.11)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Chronic pain</td><td align="left" valign="top">2538 (10.31)</td><td align="left" valign="top">9.93&#x2010;10.69</td><td align="left" valign="top">6977 (0.53)</td><td align="left" valign="top">0.52&#x2010;0.55</td><td align="left" valign="top">21.37 (20.37&#x2010;22.41)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Adjustment disorder</td><td align="left" valign="top">2748 (11.16)</td><td align="left" valign="top">10.77&#x2010;11.56</td><td align="left" valign="top">7763 (0.60)</td><td align="left" valign="top">0.58&#x2010;0.61</td><td align="left" valign="top">20.98 (20.03&#x2010;21.97)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sleep disorder</td><td align="left" valign="top">3570 (14.50)</td><td align="left" valign="top">14.06&#x2010;14.94</td><td align="left" valign="top">10,744 (0.82)</td><td align="left" valign="top">0.81&#x2010;0.84</td><td align="left" valign="top">20.41 (19.59&#x2010;21.25)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Traumatic brain injury</td><td align="left" valign="top">56 (0.23)</td><td align="left" valign="top">0.17&#x2010;0.30</td><td align="left" valign="top">101 (0.01)</td><td align="left" valign="top">0.01&#x2010;0.01</td><td align="left" valign="top">29.45 (20.84&#x2010;41.21)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Anxiety disorder</td><td align="left" valign="top">4424 (17.97)</td><td align="left" valign="top">17.49&#x2010;18.45</td><td align="left" valign="top">13,855 (1.06)</td><td align="left" valign="top">1.04&#x2010;1.08</td><td align="left" valign="top">20.40 (19.66&#x2010;21.16)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Induced psychotic disorder</td><td align="left" valign="top">21 (0.09)</td><td align="left" valign="top">0.05&#x2010;0.13</td><td align="left" valign="top">34 (0.00)</td><td align="left" valign="top">0.00&#x2010;0.00</td><td align="left" valign="top">32.75 
(18.05&#x2010;58.10)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>XGBoost: Extreme Gradient Boosting.</p></fn><fn id="table1fn2"><p><sup>b</sup>PULSNAR: positive unlabeled learning selected not at random.</p></fn><fn id="table1fn3"><p><sup>c</sup>OR: odds ratio.</p></fn><fn id="table1fn4"><p><sup>d</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Performance of PU Models</title><p><xref ref-type="table" rid="table2">Table 2</xref> presents the estimated proportion of self-harm cases by the PULSCAR and PULSNAR algorithms. The proportion of individuals coded for self-harm was only 1.85% (24,625/1,329,120). As expected, PULSCAR provided a lower <italic>&#x03B1;</italic> estimate (21,524/1,304,495, 1.65%) compared to PULSNAR (114,404/1,304,495, 8.77%), because the EHR self-harm data of Veterans did not satisfy the SCAR assumption. PULSCAR estimated 3.47% (46,120/1,329,120) of the individuals with coded and imputed self-harm, while PULSNAR provided a higher estimate of 10.46% (139,026/1,329,120).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Performance of the PULSCAR<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> and PULSNAR<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup> algorithms in imputing uncoded self-harm cases from Veterans Health Administration (VHA) electronic health record data. 
The parameter <italic>&#x03B1;</italic> denotes the estimated proportion of uncoded self-harm cases identified by each method.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">VHA (ever self-harm)</td></tr></thead><tbody><tr><td align="left" valign="top">Records, n</td><td align="left" valign="top">1,329,120</td></tr><tr><td align="left" valign="top">Coded self-harm</td><td align="left" valign="top">24,625 (1.85%, 95% CI 1.83%&#x2010;1.88%)</td></tr><tr><td align="left" valign="top">Covariates, n</td><td align="left" valign="top">159,049</td></tr><tr><td align="left" valign="top">Covariate types</td><td align="left" valign="top">3 (condition, procedure, and observation)</td></tr><tr><td align="left" valign="top">Important covariates, n</td><td align="left" valign="top">1302</td></tr><tr><td align="left" valign="top"><italic>&#x03B1;</italic> using PULSCAR</td><td align="left" valign="top">1.65%</td></tr><tr><td align="left" valign="top"><italic>&#x03B1;</italic> using PULSNAR</td><td align="left" valign="top">8.77%</td></tr><tr><td align="left" valign="top">Coded+imputed self-harm using PULSCAR</td><td align="left" valign="top">3.47%</td></tr><tr><td align="left" valign="top">Coded+imputed self-harm using PULSNAR</td><td align="left" valign="top">10.46% (14 clusters)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>PULSCAR: positive unlabeled learning selected completely at random.</p></fn><fn id="table2fn2"><p><sup>b</sup>PULSNAR: positive unlabeled learning selected not at random.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Important Covariates Used by the XGBoost Model</title><p>Our covariate selection approach identified 159,049 covariates for the XGBoost model used in the PULSCAR and PULSNAR methods, but XGBoost identified only 1302 (0.82%) as informative for classification. 
Feature (covariate) importance was measured by the average gain (mean reduction in the loss function) contributed by each covariate. The top 15 covariates with the highest gain scores are shown in <xref ref-type="table" rid="table3">Table 3</xref>.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Top 15 covariates identified by the XGBoost<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> model, along with their corresponding gain scores. Of the 159,049 covariates included in the positive and unlabeled models based on XGBoost, only 1302 (0.82%) were identified by the model as contributing to training in our cohort. Feature (covariate) importance was quantified using average gain, defined as the mean reduction in the model&#x2019;s loss function attributable to splits on a given covariate across all trees; thus, higher gain values indicate a greater contribution to improving the model&#x2019;s predictive performance.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">OMOP<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup> concept ID</td><td align="left" valign="bottom">Concept name</td><td align="left" valign="bottom">Gain score</td></tr></thead><tbody><tr><td align="left" valign="top">438028</td><td align="left" valign="top">Poisoning by a drug AND/OR a medicinal substance</td><td align="left" valign="top">21.61</td></tr><tr><td align="left" valign="top">4009713</td><td align="left" valign="top">Thoughts of self-harm</td><td align="left" valign="top">14.62</td></tr><tr><td align="left" valign="top">440270</td><td align="left" valign="top">Poisoning by antidepressants</td><td align="left" valign="top">11.18</td></tr><tr><td align="left" valign="top">2840354</td><td align="left" valign="top">Medical and Surgical @ Anatomical Regions, Upper Extremities @ Supplement @ Axilla, Left @ Open @ Autologous Tissue Substitute</td><td align="left" 
valign="top">10.68</td></tr><tr><td align="left" valign="top">44782421</td><td align="left" valign="top">Acute deep venous thrombosis of the upper extremity</td><td align="left" valign="top">10.50</td></tr><tr><td align="left" valign="top">4133169</td><td align="left" valign="top">Operative procedure on the pelvis</td><td align="left" valign="top">10.44</td></tr><tr><td align="left" valign="top">434626</td><td align="left" valign="top">Borderline personality disorder</td><td align="left" valign="top">9.87</td></tr><tr><td align="left" valign="top">4306645</td><td align="left" valign="top">Finding of thought content</td><td align="left" valign="top">9.73</td></tr><tr><td align="left" valign="top">2887059</td><td align="left" valign="top">Medical and Surgical @ Gastrointestinal System @ Bypass @ Descending Colon @ Open</td><td align="left" valign="top">9.59</td></tr><tr><td align="left" valign="top">4181019</td><td align="left" valign="top">Cluster B personality disorder</td><td align="left" valign="top">9.49</td></tr><tr><td align="left" valign="top">2832323</td><td align="left" valign="top">Medical and Surgical @ Anatomical Regions, General @ Repair @ Abdominal Wall @ Percutaneous @ No Device</td><td align="left" valign="top">9.47</td></tr><tr><td align="left" valign="top">2841968</td><td align="left" valign="top">Imaging @ Lower Arteries @ Fluoroscopy @ Abdominal Aorta @ Low Osmolar</td><td align="left" valign="top">9.30</td></tr><tr><td align="left" valign="top">2762368</td><td align="left" valign="top">Supplement Thoracic Vertebra with Nonautologous Tissue Substitute, Percutaneous Approach</td><td align="left" valign="top">9.27</td></tr><tr><td align="left" valign="top">437456</td><td align="left" valign="top">Poisoning by an anticonvulsant</td><td align="left" valign="top">8.90</td></tr><tr><td align="left" valign="top">2895851</td><td align="left" valign="top">Medical and Surgical @ Upper Bones @ Excision @ Clavicle, Left @ Open</td><td align="left" 
valign="top">8.87</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>XGBoost: Extreme Gradient Boosting.</p></fn><fn id="table3fn2"><p><sup>b</sup>OMOP: Observational Medical Outcomes Partnership.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-4"><title>Chart Review</title><p><xref ref-type="table" rid="table4">Table 4</xref> presents the interreviewer agreement among 4 reviewers across 97 uncoded self-harm cases, as well as the agreement between individual reviewers and PULSNAR, and between reviewers and the overall reviewer consensus. Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> reports the pairwise Cohen &#x03BA; coefficients for these agreements. Interreviewer reliability among 4 reviewers across 97 uncoded self-harm cases was substantial (Fleiss &#x03BA;=0.668, <italic>z</italic>=16.1, <italic>P</italic>&#x003C;.001). Notably, there were 11 patients of 39 (28.2%) where the consensus review was positive, but at least one reviewer missed the relevant evidence. Of 4&#x00D7;39=156 evaluations of positive charts, 17 (10.9%) were false negatives.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Agreement between PULSNAR<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup> and each reviewer, between individual reviewers, between each reviewer and their consensus, and between the reviewers&#x2019; consensus and PULSNAR for 97 uncoded self-harm cases. Agreement is calculated as the percentage agreeing out of the 97 charts. Overall interreviewer reliability was substantial (Fleiss &#x03BA;=0.668, <italic>z</italic>=16.1, <italic>P</italic>&#x003C;.001). 
Among 39 consensus-positive cases, 11 (28.2%) had at least one reviewer miss relevant evidence.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Reviewer 1</td><td align="left" valign="bottom">Reviewer 2</td><td align="left" valign="bottom">Reviewer 3</td><td align="left" valign="bottom">Reviewer 4</td><td align="left" valign="bottom">PULSNAR</td><td align="left" valign="bottom">Reviewer consensus</td></tr></thead><tbody><tr><td align="left" valign="top">Reviewer 1</td><td align="left" valign="top">100.0%</td><td align="left" valign="top">89.7%</td><td align="left" valign="top">94.9%</td><td align="left" valign="top">82.5%</td><td align="left" valign="top">58.8%</td><td align="left" valign="top">88.7%</td></tr><tr><td align="left" valign="top">Reviewer 2</td><td align="left" valign="top">89.7%</td><td align="left" valign="top">100.0%</td><td align="left" valign="top">88.7%</td><td align="left" valign="top">76.3%</td><td align="left" valign="top">54.6%</td><td align="left" valign="top">82.5%</td></tr><tr><td align="left" valign="top">Reviewer 3</td><td align="left" valign="top">94.9%</td><td align="left" valign="top">88.7%</td><td align="left" valign="top">100.0%</td><td align="left" valign="top">79.4%</td><td align="left" valign="top">59.8%</td><td align="left" valign="top">87.6%</td></tr><tr><td align="left" valign="top">Reviewer 4</td><td align="left" valign="top">82.5%</td><td align="left" valign="top">76.3%</td><td align="left" valign="top">79.4%</td><td align="left" valign="top">100.0%</td><td align="left" valign="top">63.9%</td><td align="left" valign="top">85.6%</td></tr><tr><td align="left" valign="top">PULSNAR</td><td align="left" valign="top">58.8%</td><td align="left" valign="top">54.6%</td><td align="left" valign="top">59.8%</td><td align="left" valign="top">63.9%</td><td align="left" valign="top">100.0%</td><td align="left" valign="top">59.8%</td></tr><tr><td 
align="left" valign="top">Reviewer consensus</td><td align="left" valign="top">88.7%</td><td align="left" valign="top">82.5%</td><td align="left" valign="top">87.6%</td><td align="left" valign="top">85.6%</td><td align="left" valign="top">59.8%</td><td align="left" valign="top">100.0%</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>PULSNAR: positive unlabeled learning selected not at random.</p></fn></table-wrap-foot></table-wrap><p>For completeness, we also calculated standard classification metrics on the 97 chart-reviewed cases using the PULSNAR probabilities versus expert consensus as the ground truth. The model demonstrated an area under the curve (AUC) of 0.6813, an <italic>F</italic><sub>1</sub>-score of 0.5517, a precision of 0.5, and a recall of 0.6154.</p></sec><sec id="s3-5"><title>Post Hoc Calibration</title><p>After applying the bias-only logit shift to the PULSNAR probabilities for the 97 reviewed charts, using their consensus chart-reviewed self-harm labels, the sum of transformed PULSNAR probabilities equaled 39 (the expert count), with a logit shift parameter of <italic>c</italic>=&#x2212;0.54 (bootstrap 95% CI &#x2212;1.1420 to &#x2212;0.0252). Applying this transformation to the 1,304,495 unlabeled patients yielded a sum of adjusted probabilities of 80,574.7, corresponding to 6.18% positives among the uncoded patients, which is lower than the PULSNAR-estimated <italic>&#x03B1;</italic> of 8.77%. This suggests that, if all notes for all Veterans without coded self-harm were chart-reviewed, 6.18% (95% CI 4.1%&#x2010;8.74%) would reveal a documented history of self-harm. 
Thus, coded self-harm represents approximately 24,625/(80,574.7+24,625)=23.4% (95% CI 17.76%-31.51%) of all documented (coded+notes) self-harm.</p></sec><sec id="s3-6"><title>Efficiency/Scalability Quantification</title><p>To quantify the operational utility and scalability of the PULSNAR approach, we evaluated both the manual effort required for expert chart review and the computational resources consumed by our model inference across the full cohort. Expert chart review time ranged from approximately 25 minutes to 2 hours per patient chart, depending on the complexity of the record and the volume of clinical notes. Across the 97 charts reviewed in this study, this corresponds to an estimated total effort of approximately 40&#x2010;200 person-hours per reviewer. In contrast, running the PULSNAR model for inference on the entire cohort of 1.3 million records was computationally tractable, requiring approximately 63 hours of wall-clock time on a machine with 16 vCPUs, 128 GB of RAM, and 128 GB of disk space.</p></sec><sec id="s3-7"><title>PULSNAR Classification vs Expert Chart Review</title><p><xref ref-type="table" rid="table5">Table 5</xref> shows the comparison of PULSNAR-classified self-harm risk categories with chart review outcomes. Among those Veterans for whom the expert reviewers unanimously found no evidence of self-harm in the patient chart, PULSNAR classified the probability of self-harm as low in 35.42% of the cases and found it to be intermediate or high in the remaining 64.58% of the cases. Classifying patients as intermediate or high risk despite the absence of self-harm coding is a desirable feature, as the patient chart might also not document self-harm. Among those Veterans for whom some, but not all, expert reviewers agreed on recorded self-harm behavior in the patient chart, PULSNAR would classify the probability of self-harm to be low only in 14.81% of the cases, and intermediate to high in 85.19% of the cases. 
Among those Veterans for whom the expert reviewers unanimously agreed on evidence of self-harm behavior in the patient chart, PULSNAR estimated the probability of self-harming behavior as low only in 13.64% of the cases, and intermediate to high in 86.36% of the cases. Stratified as such, there was an association between expert chart reviewers and PULSNAR (Fisher exact test: <italic>P</italic>=.02).</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Comparison of PULSNAR<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup>-classified self-harm risk categories (low, intermediate, and high probability) with expert chart review outcomes (unanimous against self-harm, dissenting evidence for self-harm, and unanimous for self-harm) among 97 chart-reviewed uncoded cases. There was a statistically significant association between PULSNAR risk categories and chart review outcomes (Fisher exact test, <italic>P</italic>=.02).</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Expert review</td><td align="left" valign="bottom" colspan="3">PULSNAR classification</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Low</td><td align="left" valign="bottom">Intermediate</td><td align="left" valign="bottom">High</td></tr></thead><tbody><tr><td align="left" valign="top">Unanimous no self-harm</td><td align="left" valign="top">35.42</td><td align="left" valign="top">50.00</td><td align="left" valign="top">14.58</td></tr><tr><td align="left" valign="top">Dissenting self-harm</td><td align="left" valign="top">14.81</td><td align="left" valign="top">40.74</td><td align="left" valign="top">44.44</td></tr><tr><td align="left" valign="top">Unanimous self-harm</td><td align="left" valign="top">13.64</td><td align="left" valign="top">68.18</td><td align="left" valign="top">18.18</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>PULSNAR: positive unlabeled 
learning selected not at random.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>In this study, we successfully applied the novel PULSNAR algorithm to a large representative cohort of US Veterans&#x2019; electronic health records to estimate the burden of self-harm beyond what is captured in structured diagnostic codes. Our primary finding is that relying solely on structured diagnostic codes dramatically underestimates the clinically documented prevalence of self-harm. By using only structured data within a PU learning framework and calibrating predictions against expert chart review of clinical notes, our approach provides a more comprehensive estimate of self-harm prevalence and highlights the extent of undercoding in VA electronic health records. These results support our central hypothesis that PU learning can recover hidden disease burden in routinely collected health care data. Critically, our results suggest that only about 1 in 4 patients with clinical notes documenting self-harm, or a history thereof, have the condition captured in structured diagnostic codes. While the discrimination metrics (AUC=0.68) appear modest, this is primarily because the model relies solely on structured data for prediction, while the validation ground truth is derived from a comprehensive review of free-text clinical notes. Furthermore, the nonrandom, uncoded test set, which lacks confirmed negatives, is optimized not for discrimination assessment, but rather prevalence calibration, which remains the study&#x2019;s primary, successful objective.</p><p>Self-harm, like many sensitive mental health conditions, is frequently underreported in EHRs due to confidentiality concerns, stigma, limited help-seeking, barriers to care, and inconsistent screening and documentation practices across health care settings [<xref ref-type="bibr" rid="ref38">38</xref>]. 
Additionally, self-harm is likely underdocumented in <italic>ICD</italic> codes in VHA records because VHA facilities receive federal funding on a per-enrolled-patient basis rather than through per-service billing, reducing incentives for exhaustive diagnostic coding [<xref ref-type="bibr" rid="ref39">39</xref>]. It should be noted that VHA suicide risk surveillance does not rely solely on <italic>ICD</italic> codes. Policy and standardized templates such as SBOR are used to systematically document suicidal behaviors and overdoses [<xref ref-type="bibr" rid="ref40">40</xref>], meaning that <italic>ICD</italic> undercoding does not present a complete picture of VHA&#x2019;s operational monitoring of suicide risk. This underreporting hinders accurate prevalence estimation, risk identification, resource allocation, risk modeling, and intervention design [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref42">42</xref>]. To address these gaps, we applied PULSNAR to estimate the proportion of Veterans with both coded and uncoded self-harm and to identify likely cases at scale, serving as a case study for broader phenotyping of undercoded mental illnesses.</p><p>Veterans&#x2019; health records often contain hundreds of thousands of lines of clinical notes, making it challenging and time-consuming for clinicians to manually identify individuals at risk of self-harm, suicidality, or other mental health phenotypes through chart review [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]. More broadly, the self-directed violence nomenclature itself is often confusing to frontline clinicians, and the expectation of uniform, intent-based coding is difficult to meet in routine care [<xref ref-type="bibr" rid="ref45">45</xref>]. 
Because injury and poisoning codes must encode intent, incomplete or ambiguous documentation pushes events toward &#x201C;accidental&#x201D; categories, hindering differentiation between nonsuicidal self-harm and suicidal self-directed violence [<xref ref-type="bibr" rid="ref46">46</xref>]. This is a fundamental limitation of current surveillance approaches across health systems, and it constrains the accuracy of both administrative data and research phenotypes. Further, while one might hope that a patient&#x2019;s problem list would contain a history of self-harm if it was ever noted, we found that only 22.6% (5556/24,625) of patients with coded self-harm ever had self-harm or self-harm history recorded in their VHA problem list. Given the high workload and time constraints faced by health care providers, thoroughly reviewing lengthy patient records can significantly slow the assessment process, potentially delaying risk assessment and timely intervention [<xref ref-type="bibr" rid="ref47">47</xref>]. Another structural barrier is the absence of required, standardized training for VHA or non-VHA clinicians on self-directed violence nomenclature and the use of structured diagnostic codes for suicide risk surveillance. In practice, documentation and coding are shaped by local culture, individual comfort, and time constraints rather than uniform training, which contributes to inconsistent recognition, labeling, and coding of self-harm and suicidal behaviors across settings [<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref49">49</xref>]. The PULSNAR method addresses this limitation by efficiently estimating the probability of self-harm through analysis of patient data, thereby assisting clinicians in prioritizing high-risk individuals for further evaluation.</p><p>Our expert chart review of 97 unlabeled Veterans yielded 3 key insights relevant to validating algorithmic phenotypes. 
First, probability calibration is essential and can be achieved with limited chart review. Applying post hoc calibration to the full cohort resulted in a more conservative estimate of self-harm prevalence at 7.91%, in contrast to PULSNAR&#x2019;s estimate of 10.46%. Both estimates, however, are consistent with ranges reported in prior studies of Veteran populations, supporting the plausibility of our findings [<xref ref-type="bibr" rid="ref50">50</xref>-<xref ref-type="bibr" rid="ref52">52</xref>]. Still, here we privilege the calibrated estimate tied to observed human review.</p><p>Second, the main human-human discrepancies were both definitional and false negatives from information overload. The largest contributor to divergence was the treatment of &#x201C;punching objects&#x201D; (eg, walls/doors) without stated self-harm intent. One reviewer initially counted many such episodes as nonsuicidal self-injury (NSSI) (preconsensus count 45), whereas other reviewers generally did not&#x2014;applying an intent or foreseeability standard. Borderline cases (eg, punching a window with tendon laceration) were debated under a reasonable foreseeability framework, absent self-harm intent. Similarly, statements about long-horizon self-destruction (eg, &#x201C;drinking myself to death&#x201D;) were generally classified as substance use disorder unless the episode reflected acute, explicit self-harm intent (eg, deliberate overdose). Under an inclusive NSSI sensitivity definition that includes &#x201C;punching objects&#x201D; [<xref ref-type="bibr" rid="ref53">53</xref>,<xref ref-type="bibr" rid="ref54">54</xref>], the consensus chart review identified 39 positives. This is 9 fewer than PULSNAR&#x2019;s estimate of 48, which is expected and desirable, as PULSNAR is designed to provide an upper bound on the proportion of positives among unlabeled cases. 
Notably, 28.2% of positives had at least one reviewer missing chart evidence&#x2014;a real concern, as physicians have limited time to comb through notes, unlike our reviewers who used systematic search tools and spent more time per chart than a typical visit duration.</p><p>Third, information asymmetries between data modalities, for example, notes versus codes, explain much of the remaining gap [<xref ref-type="bibr" rid="ref55">55</xref>]. For example, manual reviewers who examined unstructured clinical notes identified several cases of self-harm history (typically past suicide attempts prior to Veteran enrollment) that were documented only in narrative notes but not captured in structured data fields. In contrast, PULSNAR relied exclusively on structured data and would have missed these cases. Conversely, some PULSNAR &#x201C;high-risk&#x201D; cases lacked explicit self-harm documentation in notes but exhibited risk constellations in coded data (injuries/poisonings, major mental illness, and substance use disorders). This pattern is expected given the information asymmetry between notes and codes, reinforcing that neither source is complete [<xref ref-type="bibr" rid="ref55">55</xref>,<xref ref-type="bibr" rid="ref56">56</xref>].</p></sec><sec id="s4-2"><title>Contribution</title><p>We emphasize two contributions. (1) Epidemiologic impact: After calibration, PULSNAR yields a population-scale estimate of ever self-harm that materially exceeds code-based prevalence. Notably, prior Veteran studies report ever NSSI between ~6% and 16%, including samples with rates of 14%&#x2010;16% [<xref ref-type="bibr" rid="ref50">50</xref>-<xref ref-type="bibr" rid="ref52">52</xref>], providing an external range against which our calibrated estimate can be interpreted. (2) Operational utility: PULSNAR enables triage of &#x003E;1.3 million records, focusing human effort (and/or targeted NLP) where the marginal value of chart review is highest. 
This offers a blueprint for identifying other undercoded mental health diagnoses. In our experience, expert review of 97 charts required substantial person-hours, whereas model inference over the full cohort is computationally tractable; thus, even moderate accuracy can generate large efficiency gains by shrinking the manual search space.</p><p>Our findings support a hybrid workflow generalizable to other undetected conditions: (1) run PULSNAR on structured data to estimate the probability of being a positive case of self-harm (or other target phenotype) for uncoded individuals, (2) select a limited number of uncoded individuals from each probability bin of PULSNAR-estimated probabilities, (3) perform chart review for those selected uncoded individuals, and (4) apply post hoc calibration using chart-reviewed labels and PULSNAR-estimated probabilities of uncoded individuals to obtain calibrated cohort estimates with uncertainty. These steps, collectively, convert a complex validation into actionable epidemiology and a scalable detection pathway for underdocumented self-harm or other mental illnesses.</p></sec><sec id="s4-3"><title>Limitations</title><p>First, this study used a single data source, VHA EHRs, whose patient population, coding practices, and care patterns differ systematically from those of other US and international health systems. Accordingly, the generalizability of our findings to Veterans treated exclusively outside the VHA or to non-Veteran populations needs validation. Second, since the true <italic>&#x03B1;</italic> is generally unidentifiable [<xref ref-type="bibr" rid="ref57">57</xref>], PULSNAR estimates an upper bound on <italic>&#x03B1;</italic> across different positive subtypes. Thus, the corresponding predicted probabilities may be overestimated. 
Third, the prevalence estimate relies on the assumption that the post hoc calibration factor derived from a small, stratified sample of 97 chart-reviewed cases accurately transfers to the entire unlabeled population of 1.3 million Veterans, a potential source of error we partially addressed with bootstrap resampling to quantify uncertainty. Fourth, while we validated this method using chart review for self-harm, which is often documented in notes but uncoded, this approach cannot confirm cases absent from both notes and codes. Furthermore, other conditions may follow different recording mechanisms; for instance, posttraumatic stress disorder in the VHA is typically coded if documented due to disability benefit incentives. Consequently, extending this framework to such diagnoses may require validating against the incidence of future coded diagnoses rather than concurrent notes. Because EHR data do not contain reliable true negative labels for uncoded self-harm, conventional supervised classification metrics such as AUC-ROC, precision-recall, and <italic>F</italic><sub>1</sub>-scores cannot be unbiasedly estimated [<xref ref-type="bibr" rid="ref58">58</xref>]. This limitation is common in PU learning settings where only confirmed positives are available [<xref ref-type="bibr" rid="ref24">24</xref>]. The chart-reviewed subset comprised only previously unlabeled cases and was not selected as a representative validation cohort with confirmed positives and negatives. Accordingly, model evaluation focused on prevalence estimation and agreement with expert review rather than traditional supervised classification metrics.</p><p>Finally, our findings should be interpreted in the context of broader limitations of suicide risk surveillance. 
VHA policy and clinical operations prioritize suicidal self-directed violence within the past 12 months and rely on policy-driven documentation tools such as the SBOR [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref59">59</xref>], whereas our phenotyping targets ever self-harm based primarily on <italic>ICD</italic> coding. Self-directed violence nomenclature and intent-based coding rules are complex [<xref ref-type="bibr" rid="ref60">60</xref>], and there is no required, standardized training for VHA or non-VHA clinicians on their application [<xref ref-type="bibr" rid="ref61">61</xref>]; as a result, documentation and coding of self-harm and suicidal behavior are inconsistent across settings and clinical providers. These surveillance and training constraints likely contribute to undercoding and misclassification and should be considered when interpreting our prevalence estimates and model performance.</p></sec><sec id="s4-4"><title>Future Directions</title><p>Future research could validate our PU learning algorithms in other populations and health care settings and across a broader range of mental health diagnoses. Incorporating unstructured data, such as clinical notes, through NLP techniques, may further enhance the detection of uncoded self-harm instances and support the development of valid and reliable scales to measure self-harm. Additionally, integrating our approach into clinical workflows could facilitate real-time identification of at-risk individuals, enabling timely intervention.</p></sec><sec id="s4-5"><title>Conclusions</title><p>Our study demonstrates the effectiveness of PU learning algorithms under the SNAR assumption in identifying uncoded instances of self-harm among US Veterans. PULSNAR can support both population-level prevalence estimation and individual-level risk stratification using structured health data, although differences between structured billing codes and clinical notes may affect concordance with manual chart reviews. 
Our findings reveal a significantly higher prevalence of self-harm than what is captured in diagnostic codes, emphasizing the urgent need for more accurate detection and imputation methods. Our approach offers a scalable and efficient adjunct to manual chart reviews for detecting undetected mental illness diagnoses, with the potential to enhance clinical practice, inform policy decisions, support comparative effectiveness studies with imputed phenotypes [<xref ref-type="bibr" rid="ref13">13</xref>], improve predictive modeling of self-harm and other conditions, and ultimately contribute to reducing suicide rates among Veterans and improving mental health. We encourage the adoption of similar methods in other health care systems to address undercoding challenges, improve patient outcomes, and advance the application of ML in health care analytics.</p></sec></sec></body><back><ack><p>We gratefully acknowledge Edgar J Villarreal, PhD, and his team from the VA Office of Suicide Prevention for their invaluable guidance on the nuances of Veterans Health Administration policy and operational practices, which substantially strengthened the contextual clarity of this work. Generative artificial intelligence tools were not used at any stage of this study, including study design, methodology development, literature review, code generation, or manuscript writing and editing. However, Grammarly was used as a spell and grammar checker to improve the clarity of the manuscript.</p></ack><notes><sec><title>Funding</title><p>This research was supported by funding from the US National Institutes of Health, specifically, the National Institute of Mental Health grant R01MH129764, the National Library of Medicine grant R00LM013367, and infrastructure support from the National Center for Advancing Translational Sciences grants UL1TR001449 and UM1TR005466. The views expressed in this paper are those of the authors and do not necessarily reflect those of the National Institutes of Health. 
Access to VA data and computational environments was provided by the Department of Veterans Affairs (VA) Informatics and Computing Infrastructure (VINCI), funded under the research priority to Put VA Data to Work for Veterans (VA ORD 22-D4V).</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AUC</term><def><p>area under the curve</p></def></def-item><def-item><term id="abb2">CSR</term><def><p>compressed sparse row</p></def></def-item><def-item><term id="abb3">ED</term><def><p>emergency department</p></def></def-item><def-item><term id="abb4">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb5"><italic>ICD</italic></term><def><p><italic>International Classification of Diseases</italic></p></def></def-item><def-item><term id="abb6">LASSO</term><def><p>least absolute shrinkage and selection operator</p></def></def-item><def-item><term id="abb7">ML</term><def><p>machine learning</p></def></def-item><def-item><term id="abb8">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb9">NSSI</term><def><p>nonsuicidal self-injury</p></def></def-item><def-item><term id="abb10">OMOP</term><def><p>Observational Medical Outcomes Partnership</p></def></def-item><def-item><term id="abb11">PU</term><def><p>positive and unlabeled</p></def></def-item><def-item><term id="abb12">PULSCAR</term><def><p>positive unlabeled learning selected completely at random</p></def></def-item><def-item><term id="abb13">PULSNAR</term><def><p>positive unlabeled learning selected not at random</p></def></def-item><def-item><term id="abb14">SBOR</term><def><p>Suicide Behavior and Overdose Report</p></def></def-item><def-item><term id="abb15">SCAR</term><def><p>selected completely at random</p></def></def-item><def-item><term id="abb16">SNAR</term><def><p>selected not at random</p></def></def-item><def-item><term 
id="abb17">SNOMED</term><def><p>Systematized Nomenclature of Medicine</p></def></def-item><def-item><term id="abb18">VHA</term><def><p>Veterans Health Administration</p></def></def-item><def-item><term id="abb19">XGBoost</term><def><p>Extreme Gradient Boosting</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="web"><article-title>Facts about suicide</article-title><source>Centers for Disease Control and Prevention</source><year>2025</year><month>03</month><day>26</day><access-date>2026-05-09</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cdc.gov/suicide/facts/index.html">https://www.cdc.gov/suicide/facts/index.html</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="web"><article-title>10 leading causes of death, United States, 2024</article-title><source>Centers for Disease Control and Prevention</source><access-date>2026-05-09</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://wisqars.cdc.gov/lcd/">https://wisqars.cdc.gov/lcd/</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="report"><article-title>2022 National Survey on Drug Use and Health (NSDUH) Annual National Report</article-title><access-date>2026-05-09</access-date><publisher-name>Substance Abuse and Mental Health Services Administration</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.samhsa.gov/data/report/2023-nsduh-annual-national-report">https://www.samhsa.gov/data/report/2023-nsduh-annual-national-report</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ruiz</surname><given-names>F</given-names> </name><name name-style="western"><surname>Burgo-Black</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Hunt</surname><given-names>SC</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>M</given-names> </name><name name-style="western"><surname>Spelman</surname><given-names>JF</given-names> </name></person-group><article-title>A practical review of suicide among veterans: preventive and proactive measures for health care institutions and providers</article-title><source>Public Health Rep</source><year>2023</year><volume>138</volume><issue>2</issue><fpage>223</fpage><lpage>231</lpage><pub-id pub-id-type="doi">10.1177/00333549221085240</pub-id><pub-id pub-id-type="medline">35403486</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="report"><article-title>2024 National Veteran Suicide Prevention Annual Report</article-title><year>2024</year><month>12</month><access-date>2026-05-09</access-date><publisher-name>U.S. Department of Veterans Affairs</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.mentalhealth.va.gov/docs/data-sheets/2024/2024-Annual-Report-Part-2-of-2_508.pdf">https://www.mentalhealth.va.gov/docs/data-sheets/2024/2024-Annual-Report-Part-2-of-2_508.pdf</ext-link></comment></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schafer</surname><given-names>KM</given-names> </name><name name-style="western"><surname>Duffy</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kennedy</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Suicidal ideation, suicide attempts, and suicide death among veterans and service members: a comprehensive meta-analysis of risk factors</article-title><source>Mil Psychol</source><year>2022</year><volume>34</volume><issue>2</issue><fpage>129</fpage><lpage>146</lpage><pub-id 
pub-id-type="doi">10.1080/08995605.2021.1976544</pub-id><pub-id pub-id-type="medline">38536290</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ribeiro</surname><given-names>JD</given-names> </name><name name-style="western"><surname>Franklin</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Fox</surname><given-names>KR</given-names> </name><etal/></person-group><article-title>Self-injurious thoughts and behaviors as risk factors for future suicide ideation, attempts, and death: a meta-analysis of longitudinal studies</article-title><source>Psychol Med</source><year>2016</year><month>01</month><volume>46</volume><issue>2</issue><fpage>225</fpage><lpage>236</lpage><pub-id pub-id-type="doi">10.1017/S0033291715001804</pub-id><pub-id pub-id-type="medline">26370729</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Predescu</surname><given-names>E</given-names> </name><name name-style="western"><surname>Sipos</surname><given-names>R</given-names> </name></person-group><article-title>Self-harm behaviors, suicide attempts, and suicidal ideation in a clinical sample of children and adolescents with psychiatric disorders</article-title><source>Children (Basel)</source><year>2023</year><month>04</month><day>14</day><volume>10</volume><issue>4</issue><fpage>725</fpage><pub-id pub-id-type="doi">10.3390/children10040725</pub-id><pub-id pub-id-type="medline">37189974</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Banda</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Seneviratne</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Hernandez-Boussard</surname><given-names>T</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>NH</given-names> </name></person-group><article-title>Advances in electronic phenotyping: from rule-based definitions to machine learning models</article-title><source>Annu Rev Biomed Data Sci</source><year>2018</year><month>07</month><volume>1</volume><fpage>53</fpage><lpage>68</lpage><pub-id pub-id-type="doi">10.1146/annurev-biodatasci-080917-013315</pub-id><pub-id pub-id-type="medline">31218278</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pathak</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kho</surname><given-names>AN</given-names> </name><name name-style="western"><surname>Denny</surname><given-names>JC</given-names> </name></person-group><article-title>Electronic health records-driven phenotyping: challenges, recent advances, and perspectives</article-title><source>J Am Med Inform Assoc</source><year>2013</year><month>12</month><volume>20</volume><issue>e2</issue><fpage>e206</fpage><lpage>e211</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2013-002428</pub-id><pub-id pub-id-type="medline">24302669</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xiao</surname><given-names>C</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>J</given-names> </name></person-group><article-title>Opportunities and challenges in developing deep learning models using electronic health records data: a systematic review</article-title><source>J Am Med Inform 
Assoc</source><year>2018</year><month>10</month><day>1</day><volume>25</volume><issue>10</issue><fpage>1419</fpage><lpage>1428</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocy068</pub-id><pub-id pub-id-type="medline">29893864</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kumar</surname><given-names>P</given-names> </name><name name-style="western"><surname>Nestsiarovich</surname><given-names>A</given-names> </name><name name-style="western"><surname>Nelson</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Kerner</surname><given-names>B</given-names> </name><name name-style="western"><surname>Perkins</surname><given-names>DJ</given-names> </name><name name-style="western"><surname>Lambert</surname><given-names>CG</given-names> </name></person-group><article-title>Imputation and characterization of uncoded self-harm in major mental illness using machine learning</article-title><source>J Am Med Inform Assoc</source><year>2020</year><month>01</month><day>1</day><volume>27</volume><issue>1</issue><fpage>136</fpage><lpage>146</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocz173</pub-id><pub-id pub-id-type="medline">31651956</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nestsiarovich</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>P</given-names> </name><name name-style="western"><surname>Lauve</surname><given-names>NR</given-names> </name><etal/></person-group><article-title>Using machine learning imputed outcomes to assess drug-dependent risk of self-harm in patients with bipolar disorder: a comparative effectiveness study</article-title><source>JMIR Ment 
Health</source><year>2021</year><month>04</month><day>21</day><volume>8</volume><issue>4</issue><fpage>e24522</fpage><pub-id pub-id-type="doi">10.2196/24522</pub-id><pub-id pub-id-type="medline">33688834</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hedegaard</surname><given-names>H</given-names> </name><name name-style="western"><surname>Schoenbaum</surname><given-names>M</given-names> </name><name name-style="western"><surname>Claassen</surname><given-names>C</given-names> </name><name name-style="western"><surname>Crosby</surname><given-names>A</given-names> </name><name name-style="western"><surname>Holland</surname><given-names>K</given-names> </name><name name-style="western"><surname>Proescholdbell</surname><given-names>S</given-names> </name></person-group><article-title>Issues in developing a surveillance case definition for nonfatal suicide attempt and intentional self-harm using International Classification of Diseases, Tenth Revision, Clinical Modification (ICD-10-CM) coded data</article-title><source>Natl Health Stat Report</source><year>2018</year><month>02</month><issue>108</issue><fpage>1</fpage><lpage>19</lpage><pub-id pub-id-type="medline">29616901</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Edgcomb</surname><given-names>JB</given-names> </name><name name-style="western"><surname>Thiruvalluru</surname><given-names>R</given-names> </name><name name-style="western"><surname>Pathak</surname><given-names>J</given-names> </name><name name-style="western"><surname>Brooks</surname><given-names>JO</given-names> </name></person-group><article-title>Machine learning to differentiate risk of suicide attempt and self-harm after general medical hospitalization of women with mental 
illness</article-title><source>Med Care</source><year>2021</year><month>02</month><day>1</day><volume>59</volume><fpage>S58</fpage><lpage>S64</lpage><pub-id pub-id-type="doi">10.1097/MLR.0000000000001467</pub-id><pub-id pub-id-type="medline">33438884</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Simon</surname><given-names>GE</given-names> </name><name name-style="western"><surname>Shortreed</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Johnson</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Predicting risk of suicidal behavior from insurance claims data vs. linked data from insurance claims and electronic health records</article-title><source>Pharmacoepidemiol Drug Saf</source><year>2024</year><month>01</month><volume>33</volume><issue>1</issue><fpage>e5734</fpage><pub-id pub-id-type="doi">10.1002/pds.5734</pub-id><pub-id pub-id-type="medline">38112287</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Simon</surname><given-names>GE</given-names> </name><name name-style="western"><surname>Johnson</surname><given-names>E</given-names> </name><name name-style="western"><surname>Lawrence</surname><given-names>JM</given-names> </name><etal/></person-group><article-title>Predicting suicide attempts and suicide deaths following outpatient visits using electronic health records</article-title><source>Am J Psychiatry</source><year>2018</year><month>10</month><day>1</day><volume>175</volume><issue>10</issue><fpage>951</fpage><lpage>960</lpage><pub-id pub-id-type="doi">10.1176/appi.ajp.2018.17101167</pub-id><pub-id pub-id-type="medline">29792051</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rozova</surname><given-names>V</given-names> </name><name name-style="western"><surname>Witt</surname><given-names>K</given-names> </name><name name-style="western"><surname>Robinson</surname><given-names>J</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Verspoor</surname><given-names>K</given-names> </name></person-group><article-title>Detection of self-harm and suicidal ideation in emergency department triage notes</article-title><source>J Am Med Inform Assoc</source><year>2022</year><month>01</month><day>29</day><volume>29</volume><issue>3</issue><fpage>472</fpage><lpage>480</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocab261</pub-id><pub-id pub-id-type="medline">34897466</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Walsh</surname><given-names>CG</given-names> </name><name name-style="western"><surname>Ribeiro</surname><given-names>JD</given-names> </name><name name-style="western"><surname>Franklin</surname><given-names>JC</given-names> </name></person-group><article-title>Predicting suicide attempts in adolescents with longitudinal clinical data and machine learning</article-title><source>J Child Psychol Psychiatry</source><year>2018</year><month>12</month><volume>59</volume><issue>12</issue><fpage>1261</fpage><lpage>1270</lpage><pub-id pub-id-type="doi">10.1111/jcpp.12916</pub-id><pub-id pub-id-type="medline">29709069</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tsui</surname><given-names>FR</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Ruiz</surname><given-names>V</given-names> </name><etal/></person-group><article-title>Natural language processing and machine learning of electronic health records for prediction of first-time suicide attempts</article-title><source>JAMIA Open</source><year>2021</year><month>01</month><volume>4</volume><issue>1</issue><fpage>ooab011</fpage><pub-id pub-id-type="doi">10.1093/jamiaopen/ooab011</pub-id><pub-id pub-id-type="medline">33758800</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Su</surname><given-names>C</given-names> </name><name name-style="western"><surname>Aseltine</surname><given-names>R</given-names> </name><name name-style="western"><surname>Doshi</surname><given-names>R</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>K</given-names> </name><name name-style="western"><surname>Rogers</surname><given-names>SC</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>F</given-names> </name></person-group><article-title>Machine learning for suicide risk prediction in children and adolescents with electronic health records</article-title><source>Transl Psychiatry</source><year>2020</year><month>11</month><day>26</day><volume>10</volume><issue>1</issue><fpage>413</fpage><pub-id pub-id-type="doi">10.1038/s41398-020-01100-0</pub-id><pub-id pub-id-type="medline">33243979</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Barak-Corren</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Castro</surname><given-names>VM</given-names> </name><name name-style="western"><surname>Javitt</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Predicting suicidal behavior 
from longitudinal electronic health records</article-title><source>Am J Psychiatry</source><year>2017</year><month>02</month><day>1</day><volume>174</volume><issue>2</issue><fpage>154</fpage><lpage>162</lpage><pub-id pub-id-type="doi">10.1176/appi.ajp.2016.16010077</pub-id><pub-id pub-id-type="medline">27609239</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Guestrin</surname><given-names>C</given-names> </name></person-group><article-title>XGBoost: a scalable tree boosting system</article-title><source>Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</source><year>2016</year><publisher-name>Association for Computing Machinery</publisher-name><fpage>785</fpage><lpage>794</lpage><pub-id pub-id-type="doi">10.1145/2939672.2939785</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kumar</surname><given-names>P</given-names> </name><name name-style="western"><surname>Lambert</surname><given-names>CG</given-names> </name></person-group><article-title>Positive unlabeled learning selected not at random (PULSNAR): class proportion estimation without the selected completely at random assumption</article-title><source>PeerJ Comput Sci</source><year>2024</year><volume>10</volume><fpage>e2451</fpage><pub-id pub-id-type="doi">10.7717/peerj-cs.2451</pub-id><pub-id pub-id-type="medline">39650456</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Jaskie</surname><given-names>K</given-names> </name><name name-style="western"><surname>Elkan</surname><given-names>C</given-names> 
</name><name name-style="western"><surname>Spanias</surname><given-names>A</given-names> </name></person-group><article-title>A modified logistic regression for positive and unlabeled learning</article-title><source>2019 53rd Asilomar Conference on Signals, Systems, and Computers</source><year>2019</year><publisher-name>IEEE</publisher-name><pub-id pub-id-type="doi">10.1109/IEEECONF44664.2019.9048765</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Elkan</surname><given-names>C</given-names> </name><name name-style="western"><surname>Noto</surname><given-names>K</given-names> </name></person-group><article-title>Learning classifiers from only positive and unlabeled data</article-title><source>Proceedings of the 14th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</source><year>2008</year><publisher-name>Association for Computing Machinery</publisher-name><fpage>213</fpage><lpage>220</lpage><pub-id pub-id-type="doi">10.1145/1401890.1401920</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Du Plessis</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Sugiyama</surname><given-names>M</given-names> </name></person-group><article-title>Class prior estimation from positive and unlabeled data</article-title><source>IEICE Trans Inf Syst</source><year>2014</year><volume>E97.D</volume><issue>5</issue><fpage>1358</fpage><lpage>1362</lpage><pub-id pub-id-type="doi">10.1587/transinf.E97.D.1358</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Ramaswamy</surname><given-names>H</given-names> </name><name 
name-style="western"><surname>Scott</surname><given-names>C</given-names> </name><name name-style="western"><surname>Tewari</surname><given-names>A</given-names> </name></person-group><article-title>Mixture proportion estimation via kernel embeddings of distributions</article-title><source>Proceedings of the 33rd International Conference on Machine Learning</source><year>2016</year><publisher-name>JMLR</publisher-name><fpage>2052</fpage><lpage>2060</lpage><pub-id pub-id-type="doi">10.5555/3045390.3045607</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bekker</surname><given-names>J</given-names> </name><name name-style="western"><surname>Davis</surname><given-names>J</given-names> </name></person-group><article-title>Estimating the class prior in positive and unlabeled data through decision tree induction</article-title><source>Proc AAAI Conf Artif Intell</source><year>2018</year><volume>32</volume><issue>1</issue><pub-id pub-id-type="doi">10.1609/aaai.v32i1.11715</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Ivanov</surname><given-names>D</given-names> </name></person-group><article-title>DEDPUL: difference-of-estimated-densities-based positive-unlabeled learning</article-title><source>2020 19th IEEE International Conference on Machine Learning and Applications (ICMLA)</source><year>2020</year><publisher-name>IEEE</publisher-name><fpage>782</fpage><lpage>790</lpage><pub-id pub-id-type="doi">10.1109/ICMLA51294.2020.00128</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gerych</surname><given-names>W</given-names> </name><name 
name-style="western"><surname>Hartvigsen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Buquicchio</surname><given-names>L</given-names> </name><name name-style="western"><surname>Agu</surname><given-names>E</given-names> </name><name name-style="western"><surname>Rundensteiner</surname><given-names>E</given-names> </name></person-group><article-title>Recovering the propensity score from biased positive unlabeled data</article-title><source>Proc AAAI Conf Artif Intell</source><year>2022</year><volume>36</volume><issue>6</issue><fpage>6694</fpage><lpage>6702</lpage><pub-id pub-id-type="doi">10.1609/aaai.v36i6.20624</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Bekker</surname><given-names>J</given-names> </name><name name-style="western"><surname>Robberechts</surname><given-names>P</given-names> </name><name name-style="western"><surname>Davis</surname><given-names>J</given-names> </name></person-group><article-title>Beyond the selected completely at random assumption for learning from positive and unlabeled data</article-title><source>Machine Learning and Knowledge Discovery in Databases</source><year>2020</year><publisher-name>Springer</publisher-name><fpage>71</fpage><lpage>85</lpage><pub-id pub-id-type="doi">10.1007/978-3-030-46147-8_5</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kumar</surname><given-names>P</given-names> </name><name name-style="western"><surname>Moomtaheen</surname><given-names>F</given-names> </name><name name-style="western"><surname>Malec</surname><given-names>SA</given-names> </name><etal/></person-group><article-title>Detecting opioid use disorder in health claims data with positive unlabeled learning</article-title><source>IEEE J Biomed Health 
Inform</source><year>2025</year><month>02</month><volume>29</volume><issue>2</issue><fpage>750</fpage><lpage>757</lpage><pub-id pub-id-type="doi">10.1109/JBHI.2024.3515805</pub-id><pub-id pub-id-type="medline">40030473</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Dal Pozzolo</surname><given-names>A</given-names> </name><name name-style="western"><surname>Caelen</surname><given-names>O</given-names> </name><name name-style="western"><surname>Johnson</surname><given-names>RA</given-names> </name><name name-style="western"><surname>Bontempi</surname><given-names>G</given-names> </name></person-group><article-title>Calibrating probability with undersampling for unbalanced classification</article-title><source>2015 IEEE Symposium Series on Computational Intelligence (SSCI)</source><year>2015</year><publisher-name>IEEE</publisher-name><fpage>159</fpage><lpage>166</lpage><pub-id pub-id-type="doi">10.1109/SSCI.2015.33</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Chan</surname><given-names>PK</given-names> </name><name name-style="western"><surname>Stolfo</surname><given-names>SJ</given-names> </name></person-group><article-title>Toward scalable learning with non-uniform class and cost distributions: a case study in credit card fraud detection</article-title><source>Proceedings of the Fourth International Conference on Knowledge Discovery and Data Mining</source><year>1998</year><publisher-name>AAAI Press</publisher-name><fpage>164</fpage><lpage>168</lpage><pub-id pub-id-type="doi">10.5555/3000292.3000320</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Voss</surname><given-names>EA</given-names> 
</name><name name-style="western"><surname>Makadia</surname><given-names>R</given-names> </name><name name-style="western"><surname>Matcho</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Feasibility and utility of applications of the common data model to multiple, disparate observational health databases</article-title><source>J Am Med Inform Assoc</source><year>2015</year><month>05</month><volume>22</volume><issue>3</issue><fpage>553</fpage><lpage>564</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocu023</pub-id><pub-id pub-id-type="medline">25670757</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lynch</surname><given-names>KE</given-names> </name><name name-style="western"><surname>Deppen</surname><given-names>SA</given-names> </name><name name-style="western"><surname>DuVall</surname><given-names>SL</given-names> </name><etal/></person-group><article-title>Incrementally transforming electronic medical records into the observational medical outcomes partnership common data model: a multidimensional quality assurance approach</article-title><source>Appl Clin Inform</source><year>2019</year><month>10</month><volume>10</volume><issue>5</issue><fpage>794</fpage><lpage>803</lpage><pub-id pub-id-type="doi">10.1055/s-0039-1697598</pub-id><pub-id pub-id-type="medline">31645076</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Waller</surname><given-names>G</given-names> </name><name name-style="western"><surname>Newbury-Birch</surname><given-names>D</given-names> </name><name name-style="western"><surname>Simpson</surname><given-names>D</given-names> </name><etal/></person-group><article-title>The barriers and facilitators to the reporting and recording of self-harm in young people 
aged 18 and under: a systematic review</article-title><source>BMC Public Health</source><year>2023</year><month>01</month><day>24</day><volume>23</volume><issue>1</issue><fpage>158</fpage><pub-id pub-id-type="doi">10.1186/s12889-023-15046-7</pub-id><pub-id pub-id-type="medline">36694149</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dizon</surname><given-names>MP</given-names> </name><name name-style="western"><surname>Chow</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ong</surname><given-names>MK</given-names> </name><etal/></person-group><article-title>Lower comorbidity scores and severity levels in Veterans Health Administration hospitals: a cross-sectional study</article-title><source>BMC Health Serv Res</source><year>2024</year><month>05</month><day>8</day><volume>24</volume><issue>1</issue><fpage>601</fpage><pub-id pub-id-type="doi">10.1186/s12913-024-11063-3</pub-id><pub-id pub-id-type="medline">38714970</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gujral</surname><given-names>K</given-names> </name><name name-style="western"><surname>Bahraini</surname><given-names>N</given-names> </name><name name-style="western"><surname>Brenner</surname><given-names>LA</given-names> </name><etal/></person-group><article-title>VA&#x2019;s implementation of universal screening and evaluation for the suicide risk identification program in November 2020 - Implications for Veterans with prior mental health needs</article-title><source>PLoS ONE</source><year>2023</year><volume>18</volume><issue>4</issue><fpage>e0283633</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0283633</pub-id><pub-id pub-id-type="medline">37040367</pub-id></nlm-citation></ref><ref 
id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Meda</surname><given-names>N</given-names> </name><name name-style="western"><surname>Angelozzi</surname><given-names>L</given-names> </name><name name-style="western"><surname>Poletto</surname><given-names>M</given-names> </name><etal/></person-group><article-title>How many people die by suicide each year? Not 727,000: a systematic review and meta-analysis of suicide underreporting across 71 countries over 122 years</article-title><source>Front Psychiatry</source><year>2025</year><volume>16</volume><fpage>1609580</fpage><pub-id pub-id-type="doi">10.3389/fpsyt.2025.1609580</pub-id><pub-id pub-id-type="medline">40873674</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Getzen</surname><given-names>E</given-names> </name><name name-style="western"><surname>Ungar</surname><given-names>L</given-names> </name><name name-style="western"><surname>Mowery</surname><given-names>D</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Long</surname><given-names>Q</given-names> </name></person-group><article-title>Mining for equitable health: assessing the impact of missing data in electronic health records</article-title><source>J Biomed Inform</source><year>2023</year><month>03</month><volume>139</volume><fpage>104269</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2022.104269</pub-id><pub-id pub-id-type="medline">36621750</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McKenzie</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Rajapakshe</surname><given-names>R</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>H</given-names> </name><name name-style="western"><surname>Rajapakshe</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>A</given-names> </name></person-group><article-title>A semiautomated chart review for assessing the development of radiation pneumonitis using natural language processing: diagnostic accuracy and feasibility study</article-title><source>JMIR Med Inform</source><year>2021</year><month>11</month><day>12</day><volume>9</volume><issue>11</issue><fpage>e29241</fpage><pub-id pub-id-type="doi">10.2196/29241</pub-id><pub-id pub-id-type="medline">34766919</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xie</surname><given-names>F</given-names> </name><name name-style="western"><surname>Ling Grant</surname><given-names>DS</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Amundsen</surname><given-names>BI</given-names> </name><name name-style="western"><surname>Hechter</surname><given-names>RC</given-names> </name></person-group><article-title>Identifying suicidal ideation and attempt from clinical notes within a large integrated health care system</article-title><source>Perm J</source><year>2022</year><month>04</month><day>5</day><volume>26</volume><issue>1</issue><fpage>85</fpage><lpage>93</lpage><pub-id pub-id-type="doi">10.7812/TPP/21.102</pub-id><pub-id pub-id-type="medline">35609162</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Teismann</surname><given-names>T</given-names> </name><name 
name-style="western"><surname>Eimen</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Cwik</surname><given-names>JC</given-names> </name></person-group><article-title>Misclassification of self-directed violence</article-title><source>Crisis</source><year>2023</year><month>11</month><volume>44</volume><issue>6</issue><fpage>525</fpage><lpage>528</lpage><pub-id pub-id-type="doi">10.1027/0227-5910/a000897</pub-id><pub-id pub-id-type="medline">36636794</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Simon</surname><given-names>GE</given-names> </name><name name-style="western"><surname>Shortreed</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Boggs</surname><given-names>JM</given-names> </name><etal/></person-group><article-title>Accuracy of ICD-10-CM encounter diagnoses from health records for identifying self-harm events</article-title><source>J Am Med Inform Assoc</source><year>2022</year><month>11</month><day>14</day><volume>29</volume><issue>12</issue><fpage>2023</fpage><lpage>2031</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocac144</pub-id><pub-id pub-id-type="medline">36018725</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Arndt</surname><given-names>BG</given-names> </name><name name-style="western"><surname>Micek</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Rule</surname><given-names>A</given-names> </name><name name-style="western"><surname>Shafer</surname><given-names>CM</given-names> </name><name name-style="western"><surname>Baltus</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Sinsky</surname><given-names>CA</given-names> </name></person-group><article-title>More 
tethered to the EHR: EHR workload trends among academic primary care physicians, 2019-2023</article-title><source>Ann Fam Med</source><year>2024</year><volume>22</volume><issue>1</issue><fpage>12</fpage><lpage>18</lpage><pub-id pub-id-type="doi">10.1370/afm.3047</pub-id><pub-id pub-id-type="medline">38253499</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Davis</surname><given-names>M</given-names> </name><name name-style="western"><surname>Siegel</surname><given-names>J</given-names> </name><name name-style="western"><surname>Becker-Haimes</surname><given-names>EM</given-names> </name><etal/></person-group><article-title>Identifying common and unique barriers and facilitators to implementing evidence-based practices for suicide prevention across primary care and specialty mental health settings</article-title><source>Arch Suicide Res</source><year>2023</year><volume>27</volume><issue>2</issue><fpage>192</fpage><lpage>214</lpage><pub-id pub-id-type="doi">10.1080/13811118.2021.1982094</pub-id><pub-id pub-id-type="medline">34651544</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>LeCloux</surname><given-names>M</given-names> </name><name name-style="western"><surname>Aguinaldo</surname><given-names>LD</given-names> </name><name name-style="western"><surname>Lanzillo</surname><given-names>EC</given-names> </name><name name-style="western"><surname>Horowitz</surname><given-names>LM</given-names> </name></person-group><article-title>PCP opinions of universal suicide risk screening in rural primary care: current challenges and strategies for successful implementation</article-title><source>J Rural 
Health</source><year>2021</year><month>06</month><volume>37</volume><issue>3</issue><fpage>554</fpage><lpage>564</lpage><pub-id pub-id-type="doi">10.1111/jrh.12508</pub-id><pub-id pub-id-type="medline">32845543</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Halverson</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Mann</surname><given-names>AJD</given-names> </name><name name-style="western"><surname>Zelkowitz</surname><given-names>RL</given-names> </name><etal/></person-group><article-title>Nonsuicidal self-injury in veterans: prevalence, clinical characteristics, and gender differences from a national cohort</article-title><source>Psychiatry Res</source><year>2022</year><month>09</month><volume>315</volume><fpage>114708</fpage><pub-id pub-id-type="doi">10.1016/j.psychres.2022.114708</pub-id><pub-id pub-id-type="medline">35868073</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gromatsky</surname><given-names>M</given-names> </name><name name-style="western"><surname>Halverson</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Dillon</surname><given-names>KH</given-names> </name><etal/></person-group><article-title>The prevalence of nonsuicidal self-injury in military personnel: a systematic review and meta-analysis</article-title><source>Trauma Violence Abuse</source><year>2023</year><month>12</month><volume>24</volume><issue>5</issue><fpage>2936</fpage><lpage>2952</lpage><pub-id pub-id-type="doi">10.1177/15248380221119513</pub-id><pub-id pub-id-type="medline">36062896</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Bryan</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bryan</surname><given-names>A</given-names> </name></person-group><article-title>Nonsuicidal self-injury among a sample of United States military personnel and veterans enrolled in college classes</article-title><source>J Clin Psychol</source><year>2014</year><month>09</month><volume>70</volume><issue>9</issue><fpage>874</fpage><lpage>885</lpage><pub-id pub-id-type="doi">10.1002/jclp.22075</pub-id><pub-id pub-id-type="medline">24619940</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kimbrel</surname><given-names>NA</given-names> </name><name name-style="western"><surname>Thomas</surname><given-names>SP</given-names> </name><name name-style="western"><surname>Hicks</surname><given-names>TA</given-names> </name><etal/></person-group><article-title>Wall/object punching: an important but under-recognized form of nonsuicidal self-injury</article-title><source>Suicide Life Threat Behav</source><year>2018</year><month>10</month><volume>48</volume><issue>5</issue><fpage>501</fpage><lpage>511</lpage><pub-id pub-id-type="doi">10.1111/sltb.12371</pub-id><pub-id pub-id-type="medline">28925016</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Patel</surname><given-names>TA</given-names> </name><name name-style="western"><surname>Dillon</surname><given-names>KH</given-names> </name><name name-style="western"><surname>Cassiello-Robbins</surname><given-names>C</given-names> </name><name name-style="western"><surname>Calhoun</surname><given-names>PS</given-names> </name><name name-style="western"><surname>Beckham</surname><given-names>JC</given-names> </name><name 
name-style="western"><surname>Kimbrel</surname><given-names>NA</given-names> </name></person-group><article-title>Anger, impulsivity and wall/object punching in a sample of U.S. veterans with psychiatric disorders</article-title><source>J Psychiatr Res</source><year>2022</year><month>03</month><volume>147</volume><fpage>269</fpage><lpage>273</lpage><pub-id pub-id-type="doi">10.1016/j.jpsychires.2022.01.036</pub-id><pub-id pub-id-type="medline">35074743</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nguyen</surname><given-names>CT</given-names> </name><name name-style="western"><surname>Richman</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Chiles</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>JA</given-names> </name><name name-style="western"><surname>McDonald</surname><given-names>MLN</given-names> </name></person-group><article-title>Osteoarthritis case identification in the Million Veteran Program cohort: comparison of diagnostic codes versus clinical notes</article-title><source>Osteoarthr Cartil Open</source><year>2026</year><month>06</month><volume>8</volume><issue>2</issue><fpage>100758</fpage><pub-id pub-id-type="doi">10.1016/j.ocarto.2026.100758</pub-id><pub-id pub-id-type="medline">41799924</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moldwin</surname><given-names>A</given-names> </name><name name-style="western"><surname>Demner-Fushman</surname><given-names>D</given-names> </name><name name-style="western"><surname>Goodwin</surname><given-names>TR</given-names> </name></person-group><article-title>Empirical findings on the role of structured data, unstructured data, and their combination for automatic 
clinical phenotyping</article-title><source>AMIA Jt Summits Transl Sci Proc</source><year>2021</year><volume>2021</volume><fpage>445</fpage><lpage>454</lpage><pub-id pub-id-type="medline">34457160</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Blanchard</surname><given-names>G</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>G</given-names> </name><name name-style="western"><surname>Scott</surname><given-names>C</given-names> </name></person-group><article-title>Semi-supervised novelty detection</article-title><source>J Mach Learn Res</source><year>2010</year><month>12</month><day>1</day><volume>11</volume><fpage>2973</fpage><lpage>3009</lpage><pub-id pub-id-type="doi">10.5555/1756006.1953028</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zou</surname><given-names>R</given-names> </name><name name-style="western"><surname>Williamson</surname><given-names>BD</given-names> </name><name name-style="western"><surname>Shortreed</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Coley</surname><given-names>RY</given-names> </name></person-group><article-title>Validation of a risk&#x2010;prediction model in the presence of outcome misclassification</article-title><source>Stat Med</source><year>2026</year><month>04</month><volume>45</volume><issue>8-9</issue><fpage>e70377</fpage><pub-id pub-id-type="doi">10.1002/sim.70377</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Felleman</surname><given-names>BI</given-names> </name><name 
name-style="western"><surname>Asamsama</surname><given-names>OH</given-names> </name><name name-style="western"><surname>Oliva</surname><given-names>EM</given-names> </name><name name-style="western"><surname>Han</surname><given-names>BH</given-names> </name></person-group><article-title>New methodology to improve tracking of veteran overdose deaths and characterization of a population of veteran overdose decedents in San Diego County</article-title><source>Drug Alcohol Depend Rep</source><year>2025</year><month>12</month><volume>17</volume><fpage>100392</fpage><pub-id pub-id-type="doi">10.1016/j.dadr.2025.100392</pub-id><pub-id pub-id-type="medline">41322677</pub-id></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Brenner</surname><given-names>LA</given-names> </name><name name-style="western"><surname>Breshears</surname><given-names>RE</given-names> </name><name name-style="western"><surname>Betthauser</surname><given-names>LM</given-names> </name><etal/></person-group><article-title>Implementation of a suicide nomenclature within two VA healthcare settings</article-title><source>J Clin Psychol Med Settings</source><year>2011</year><month>06</month><volume>18</volume><issue>2</issue><fpage>116</fpage><lpage>128</lpage><pub-id pub-id-type="doi">10.1007/s10880-011-9240-9</pub-id><pub-id pub-id-type="medline">21626353</pub-id></nlm-citation></ref><ref id="ref61"><label>61</label><nlm-citation citation-type="report"><article-title>Inadequate staff training and lack of oversight contribute to the Veterans Health Administration&#x2019;s suicide risk screening and evaluation deficiencies</article-title><year>2024</year><month>12</month><day>18</day><access-date>2026-05-11</access-date><publisher-name>United States Department of Veterans Affairs</publisher-name><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.vaoig.gov/sites/default/files/reports/2024-12/vaoig-23-02939-13.pdf">https://www.vaoig.gov/sites/default/files/reports/2024-12/vaoig-23-02939-13.pdf</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p><italic>ICD-CM</italic> codes and Cohen &#x03BA; coefficients. <italic>ICD-CM</italic>: <italic>International Classification of Diseases, Clinical Modification</italic>.</p><media xlink:href="jmir_v28i1e89071_app1.docx" xlink:title="DOCX File, 1753 KB"/></supplementary-material></app-group></back></article>