<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v27i1e76433</article-id><article-id pub-id-type="doi">10.2196/76433</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Multicriteria Optimization of Language Models for Heart Failure With Preserved Ejection Fraction Symptom Detection in Spanish Electronic Health Records: Comparative Modeling Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Mata</surname><given-names>Jacinto</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Pach&#x00F3;n</surname><given-names>Victoria</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Manovel</surname><given-names>Ana</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Ma&#x00F1;a</surname><given-names>Manuel J</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>de la Villa</surname><given-names>Manuel</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>I&#x00B2;C Research Group</institution><addr-line>Universidad de Huelva</addr-line><addr-line>Huelva</addr-line><country>Spain</country></aff><aff id="aff2"><institution>Cardiology Department, Juan Ram&#x00F3;n Jim&#x00E9;nez University Hospital, Multidisciplinary Amyloidosis Unit Huelva, Hospital Juan Ram&#x00F3;n Jim&#x00E9;nez</institution><addr-line>Huelva</addr-line><country>Spain</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Senst</surname><given-names>Benjamin</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Milic</surname><given-names>Marko Kimi</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Jacinto Mata, PhD, I&#x00B2;C Research Group, Universidad de Huelva, Huelva, 21007, Spain, +34 687862089; <email>mata@uhu.es</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>17</day><month>7</month><year>2025</year></pub-date><volume>27</volume><elocation-id>e76433</elocation-id><history><date date-type="received"><day>23</day><month>04</month><year>2025</year></date><date date-type="rev-recd"><day>23</day><month>05</month><year>2025</year></date><date date-type="accepted"><day>26</day><month>05</month><year>2025</year></date></history><copyright-statement>&#x00A9; Jacinto Mata, Victoria 
Pach&#x00F3;n, Ana Manovel, Manuel J Ma&#x00F1;a, Manuel de la Villa. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 17.7.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2025/1/e76433"/><abstract><sec><title>Background</title><p>Heart failure with preserved ejection fraction (HFpEF) is a major clinical manifestation of cardiac amyloidosis, a condition frequently underdiagnosed due to its nonspecific symptomatology. Electronic health records (EHRs) offer a promising avenue for supporting early symptom detection through natural language processing. However, identifying relevant clinical cues within unstructured narratives, particularly in Spanish, remains a significant challenge due to the scarcity of annotated corpora and domain-specific models. 
This study proposes and evaluates a Transformer-based natural language processing framework for automated detection of HFpEF-related symptoms in Spanish EHRs.</p></sec><sec><title>Objective</title><p>The aim of this study is to assess the feasibility of leveraging unstructured clinical narratives to support early identification of heart failure phenotypes indicative of cardiac amyloidosis. It also examines how domain-specific language models and clinically guided optimization strategies can improve the reliability, sensitivity, and generalizability of symptom detection in real-world EHRs.</p></sec><sec sec-type="methods"><title>Methods</title><p>A novel corpus of 15,304 Spanish clinical documents was manually annotated and validated by cardiology experts. The corpus was derived from the records of 262 patients (173 with suspected cardiac amyloidosis and 89 without). In total, 8 Transformer-based language models were evaluated, including general-purpose models, biomedical-specialized variants, and Longformers. Three clinically motivated optimization strategies were implemented to align models&#x2019; behavior with different diagnostic priorities: maximizing area under the curve (AUC) to enhance overall discrimination, optimizing <italic>F</italic><sub>1</sub>-score to balance sensitivity and precision, and prioritizing sensitivity to minimize false negatives. These strategies were independently applied during the fine-tuning of the models to assess their impact on performance under different clinical constraints. To ensure robust evaluation, testing was conducted on a dataset composed exclusively of previously unseen patients, allowing performance to be assessed under realistic and generalizable conditions.</p></sec><sec sec-type="results"><title>Results</title><p>All models achieved high performance, with AUC values above 0.940. 
The best-performing model, <italic>Longformer Biomedical-clinical</italic>, reached an AUC of 0.987, <italic>F</italic><sub>1</sub>-score of 0.985, sensitivity of 0.987, and specificity of 0.987 on the test dataset. Models optimized for sensitivity reduced the false-negative rate to under 3%, a key threshold for clinical safety. Comparative analyses confirmed that domain-adapted, long-sequence models are better suited for the semantic and structural complexity of Spanish clinical texts than general-purpose models.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Transformer-based models can reliably detect HFpEF-related symptoms from Spanish EHRs, even in the presence of class imbalance and substantial linguistic complexity. The results show that combining domain-specific pretraining with long-context modeling architectures and clinically aligned optimization strategies leads to substantial gains in classification performance, particularly in sensitivity. These models not only achieve high accuracy and generalization on unseen patients but also demonstrate robustness in handling the semantic nuances and narrative structure of real-world clinical documentation. These findings support the potential deployment of Transformer-based systems as effective screening tools to prioritize patients at risk for cardiac amyloidosis in Spanish-speaking health care settings.</p></sec></abstract><kwd-group><kwd>natural language processing</kwd><kwd>transformer</kwd><kwd>clinical language models</kwd><kwd>manual corpus annotation</kwd><kwd>symptom extraction</kwd><kwd>early diagnosis support</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Heart failure (HF) represents a major public health challenge, with its prevalence rising as a result of population aging and improved survival among individuals with cardiovascular conditions. 
Among its phenotypes, heart failure with preserved ejection fraction (HFpEF) accounts for nearly 50% of cases and is characterized by congestion and dyspnea in the absence of marked systolic dysfunction. Despite its clinical relevance, HFpEF remains diagnostically challenging due to its pathophysiological and etiological heterogeneity [<xref ref-type="bibr" rid="ref1">1</xref>].</p><p>A key underlying cause of HFpEF is cardiac amyloidosis, an infiltrative disorder characterized by extracellular deposits of misfolded transthyretin protein that impair myocardial structure and function. The most prevalent variant is the wild-type form of transthyretin cardiac amyloidosis (ATTR-CM), which predominantly affects older adults and leads to progressive cardiac dysfunction and worsening HF symptoms. Unlike other HFpEF etiologies, ATTR-CM is amenable to disease-modifying therapies that can slow progression, reduce hospitalizations, and improve survival outcomes. Nonetheless, early diagnosis remains difficult due to low clinical suspicion, nonspecific echocardiographic findings, and historically limited awareness within the medical community. Therapeutic efficacy is maximized when treatment is initiated early, underscoring the importance of timely and optimized detection strategies in HF populations [<xref ref-type="bibr" rid="ref2">2</xref>].</p><p>In this context, artificial intelligence (AI) has emerged as a promising approach for the early identification of disease within large-scale clinical datasets [<xref ref-type="bibr" rid="ref3">3</xref>]. Specifically, natural language processing (NLP) and deep learning models can analyze electronic health records (EHRs) to identify and extract mentions of HFpEF and associated terminology, facilitating the development of high-quality annotated corpora [<xref ref-type="bibr" rid="ref4">4</xref>]. 
Such corpora are essential for training AI models capable of detecting textual patterns indicative of ATTR-CM, enabling the prioritization of high-risk patients. Applying these technologies to the structured identification of HF mentions in clinical narratives can enhance diagnostic accuracy, support data-driven decision-making, and enable timely interventions that improve prognosis and quality of life.</p><p>Despite the growing interest in clinical NLP, the development of resources in Spanish remains limited. The scarcity of expert-annotated clinical corpora in Spanish hinders the creation and evaluation of language models tailored to this language. The lack of pretrained clinical language models in Spanish, compared to the resources available for English, further restricts the development of effective applications in this domain. Recently, Garc&#x00ED;a Subies et al [<xref ref-type="bibr" rid="ref5">5</xref>] conducted a study on the efficiency of encoder-based Transformer models in named entity recognition and classification tasks, aiming to identify the most effective resources in this context. The authors highlighted the significant gap in NLP resources for the Spanish language, particularly in the clinical sector. As described in their work, annotated corpora in Spanish are extremely scarce, and most are designed for named entity recognition tasks, with very few available for text classification. The authors noted that, when working with clinical narratives, encoder-based models currently outperform emerging generative language models. Their findings emphasize the urgent need to develop encoder-based models specialized in Spanish that can process clinical data with high precision. One of the most significant challenges in clinical NLP, both in general and particularly in Spanish, is the handling of negation and speculation in clinical texts. 
Although some initiatives such as <italic>NUBes</italic> [<xref ref-type="bibr" rid="ref6">6</xref>] have emerged, available resources to properly address this linguistic feature remain extremely limited. As a result, this remains one of the most relevant and complex issues currently faced by the scientific community in the field.</p><p>Our study aims to develop and validate an AI-based system for the automated detection of HFpEF mentions in Spanish EHRs, with the goals of supporting early detection of potential ATTR-CM cases, enhancing diagnostic efficiency, and mitigating underdiagnosis. As outlined in [<xref ref-type="bibr" rid="ref7">7</xref>], several research challenges remain in clinical NLP. This work addresses three of them: applying deep learning to clinical text classification, overcoming language-related barriers, and leveraging transfer learning for narrative clinical report classification.</p><p>To guide this work, we pose the following research questions (RQs):</p><p>RQ1. Can a manually annotated corpus of Spanish EHRs be effectively constructed and validated for the detection of HFpEF-related symptoms?</p><p>RQ2. What is the performance of general-purpose, biomedical-pretrained, and long-document, encoder-based Transformer models when applied to clinical text classification in Spanish?</p><p>RQ3. What is the impact of different optimization strategies (area under the curve [AUC], <italic>F</italic><sub>1</sub>-score, and sensitivity) on model performance for symptom detection?</p><p>RQ4. 
To what extent can encoder-based Transformer models support the early identification of HFpEF symptomatology indicative of cardiac amyloidosis in Spanish-language clinical narratives?</p><p>To address these questions, this study makes the following contributions: (1) the construction of a manually annotated corpus of Spanish EHRs for HFpEF symptom detection, validated by cardiology specialists and intended for public release; (2) a comparative evaluation of general, biomedical, and long-document Transformer models for clinical text classification; (3) an analysis of optimization strategies tailored to AUC, <italic>F</italic><sub>1</sub>-score, and sensitivity; and (4) a demonstration of the feasibility of using NLP models to support the early detection of HFpEF symptoms relevant to cardiac amyloidosis.</p></sec><sec id="s1-2"><title>Related Work</title><p>Transformer-based models have shown increasing effectiveness in extracting clinically relevant information from unstructured EHRs, particularly within the context of HF. Adejumo et al [<xref ref-type="bibr" rid="ref8">8</xref>] used ClinicalBERT to extract New York Heart Association classifications and HF symptoms from clinical notes, achieving area under the receiver operating characteristic curves (AUROCs) greater than 0.98 and identifying functional status in 83% more cases than explicit documentation alone. Similarly, Liu et al [<xref ref-type="bibr" rid="ref9">9</xref>] used ClinicalBERT embeddings within a predictive framework to identify early HF onset, outperforming traditional methods in a large-scale real-world dataset.</p><p>In the context of clinical trials, Marti-Castellote et al [<xref ref-type="bibr" rid="ref10">10</xref>] developed a hybrid NLP approach integrating Clinical Longformer and GPT-4o to adjudicate HF-related hospitalizations. 
Their system reproduced expert adjudications with 83% concordance, substantially reducing manual effort and demonstrating the scalability and reliability of Transformer-based models in high-stakes research. Ahmad et al [<xref ref-type="bibr" rid="ref11">11</xref>] provided a comprehensive overview of machine learning applications in HFpEF, emphasizing the potential of unsupervised learning techniques to identify novel patient subgroups and improve phenotypic classification. Their work underscores the importance of leveraging diverse data modalities, including clinical notes, to enhance diagnostic precision in HFpEF. Complementarily, Houssein et al [<xref ref-type="bibr" rid="ref12">12</xref>] developed an advanced NLP framework using stacked BERT and character embeddings to detect heart disease risk factors from clinical narratives. Their model achieved an <italic>F</italic><sub>1</sub>-score of 93.66% on the i2b2 dataset, demonstrating the efficacy of combining deep learning techniques for extracting critical clinical information. Houssein et al [<xref ref-type="bibr" rid="ref13">13</xref>] benchmarked 5 Transformer architectures (BERT, BioBERT, RoBERTa, XLNet, and BioClinicalBERT) on the i2b2 dataset for risk factor extraction in heart disease, achieving state-of-the-art performance with a micro <italic>F</italic><sub>1</sub>-score of 94.26%. These results reinforce the value of domain-specific Transformer models in clinical NLP applications. Notably, Fan et al [<xref ref-type="bibr" rid="ref14">14</xref>] implemented a Transformer-based clustering framework to identify 7 HF subtypes in a cohort of more than 379,000 patients. These subgroups, some of which were independent of left ventricular ejection fraction, highlight the capacity of deep learning approaches to uncover novel and clinically meaningful HF phenotypes, which is particularly relevant given the heterogeneity of HFpEF. 
These studies highlight the growing role of machine learning and NLP in advancing HF research and patient care. In the context of the Spanish language, recent work such as that of Garc&#x00ED;a Subies et al [<xref ref-type="bibr" rid="ref5">5</xref>] has explored the use of encoder-based Transformer models for named entity recognition and classification tasks in Spanish clinical texts. However, their efforts have focused on the evaluation and application of language models on existing corpora and have not addressed the detection of HFpEF-specific symptoms.</p><p>Despite these advances, most of the existing research focuses on English-language EHRs, frequently relying on structured data or datasets with broad cardiovascular end points. To date, only a limited number of studies have addressed the automated detection of HFpEF-specific symptoms and none, to our knowledge, have done so in Spanish-language clinical narratives. Moreover, existing models are rarely optimized for clinically critical metrics such as sensitivity, nor do they account for the long and complex structure of real-world clinical narratives. This study addresses these gaps by introducing a manually annotated Spanish-language corpus for HFpEF symptom detection, evaluating multiple Transformer architectures, including models adapted for long documents, and implementing task-specific optimization strategies aligned with diagnostic priorities. This represents the first demonstration of the feasibility and clinical applicability of Transformer-based NLP for detecting HFpEF in Spanish EHRs, with direct implications for improving early recognition of cardiac amyloidosis.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>This study was approved by the Andalusian Biomedical Research Ethics Coordinating Committee under protocol version 1 dated October 2, 2020 (internal protocol code 2382-N-20). 
The corpus documents were anonymized using a rigorous, tailor-made protocol specifically designed to protect patient data privacy.</p></sec><sec id="s2-2"><title>The Corpus</title><sec id="s2-2-1"><title>Overview</title><p>Cardiac amyloidosis may present with a broad spectrum of clinical manifestations. This study focuses on HFpEF, one of the most clinically significant manifestations, to evaluate the performance of language models in detecting its presence or absence within clinical documents. Accordingly, the experiments have been conducted using a dataset specifically annotated for HFpEF. The full corpus, comprising 15 datasets of manually annotated clinical documents covering all cardiac amyloidosis symptoms, will be introduced in future publications. This corpus will be made publicly available to the scientific community.</p></sec><sec id="s2-2-2"><title>Description</title><p>The corpus was compiled from the EHRs of 262 patients. The records of 173 patients who underwent testing for cardiac amyloidosis were selected and supplemented with the records of 89 patients who did not undergo these tests, aiming to mitigate overfitting in the models. These records were obtained from the Cardiology, Internal Medicine, Neurology, Neurophysiology, and Traumatology units at Juan Ram&#x00F3;n Jim&#x00E9;nez Hospital in Huelva, Spain. The selection of documents was based on patient records rather than predefined time intervals. Consequently, the temporal coverage of the dataset corresponds to the full range of available clinical documentation for each patient. 
Specifically, the corpus spans from 2007 to 2021, reflecting the period during which these patients received care and generated clinical records within the EHR system.</p><p>Following a detailed analysis to determine the most relevant document types for this study, the following clinical records were selected: <italic>anamnesis</italic>, <italic>consultation reports</italic>, <italic>discharge reports</italic>, and <italic>progress notes</italic>. To accommodate language model context limitations, documents were segmented into clinically relevant sections: <italic>current illness</italic>, <italic>clinical assessment</italic>, <italic>reason for consultation</italic>, <italic>medical history</italic>, and <italic>complementary tests</italic>. For progress notes, only the <italic>evolution</italic>, <italic>clinical assessment</italic>, and <italic>complementary test</italic> sections were retained to prioritize relevant content and reduce redundancy.</p><p><xref ref-type="table" rid="table1">Table 1</xref> provides the total number of documents per type, along with the overall corpus size. A data cleaning process was performed to eliminate duplicate documents, system-generated records without clinical content, and incomplete texts (eg, entries with minimal or blank content). Additionally, documents exhibiting a high degree of textual overlap were removed. This step was necessary because in many cases, clinical documents are generated as continuations of previous notes, incorporating incremental updates (eg, results of new tests or follow-up visits). To avoid excessive redundancy and reduce the risk of overfitting due to repetitive content, we retained only documents corresponding to distinct clinical episodes. When multiple documents were associated with the same episode, we preserved the longest version, as it typically contained the most comprehensive clinical information. Minor extraction errors were also corrected. 
As a result of this process, the corpus was reduced from 30,367 to 15,304 documents. Of these, 11,025 (72%) correspond to patients with suspected cardiac amyloidosis (n=173), and 4279 (28%) to control patients without the disease (n=89).</p><p><xref ref-type="table" rid="table2">Table 2</xref> presents summary statistics of document length by type, providing an initial overview of the textual characteristics that motivate the modeling approach adopted in this study. The documents in the corpus show considerable variability in length, with average values ranging from approximately 60 to over 120 words across document types. In all cases, the mean values exceed the median, suggesting a positively skewed distribution characterized by the presence of a relevant subset of longer documents that contribute to an extended upper tail. This is further supported by high standard deviation values and maximum word counts exceeding 2000 in some cases. These characteristics underscore the need for using long-context Transformer models capable of handling extended sequences without truncation, as will be demonstrated in the experimentation and results sections of this study. 
A sample of the dataset is presented in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Number of documents by type.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Document type</td><td align="left" valign="bottom">Number of documents</td></tr></thead><tbody><tr><td align="left" valign="top">Anamnesis</td><td align="left" valign="top">5040</td></tr><tr><td align="left" valign="top">Consultation reports</td><td align="left" valign="top">5235</td></tr><tr><td align="left" valign="top">Discharge reports</td><td align="left" valign="top">3970</td></tr><tr><td align="left" valign="top">Progress notes</td><td align="left" valign="top">16,122</td></tr><tr><td align="left" valign="top">Total</td><td align="left" valign="top">30,367</td></tr><tr><td align="left" valign="top">Total after cleaning process</td><td align="left" valign="top">15,304</td></tr></tbody></table></table-wrap><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Summary statistics of document length (in words) by document type. For each type of clinical note included in the corpus, the table presents the mean, median, standard deviation, and maximum number of words per document. 
The values reflect the variability in structure and verbosity across clinical document genres.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Document type</td><td align="left" valign="bottom">Mean (SD)</td><td align="left" valign="bottom">Median</td><td align="left" valign="bottom">Maximum</td></tr></thead><tbody><tr><td align="left" valign="top">Anamnesis</td><td align="left" valign="top">90.12 (139.3)</td><td align="left" valign="top">40</td><td align="left" valign="top">1581</td></tr><tr><td align="left" valign="top">Consultation reports</td><td align="left" valign="top">124.41 (156.45)</td><td align="left" valign="top">62</td><td align="left" valign="top">1228</td></tr><tr><td align="left" valign="top">Discharge reports</td><td align="left" valign="top">116.59 (172.04)</td><td align="left" valign="top">52</td><td align="left" valign="top">2253</td></tr><tr><td align="left" valign="top">Progress notes</td><td align="left" valign="top">62.93 (75.13)</td><td align="left" valign="top">37</td><td align="left" valign="top">852</td></tr></tbody></table></table-wrap></sec><sec id="s2-2-3"><title>Labeling Process</title><p>This section outlines the corpus labeling process, with particular emphasis on the collection related to HFpEF, as this dataset was used for the experiments presented in this study. The annotation process followed a rigorous methodology appropriate for this type of task. Given the dataset&#x2019;s size, its manual annotation required 6 months, highlighting the complexity of this phase of the study.</p><p>Based on prior evaluations of annotation tools [<xref ref-type="bibr" rid="ref15">15</xref>], Prodigy [<xref ref-type="bibr" rid="ref16">16</xref>] was selected due to its adaptability to the project&#x2019;s requirements and its ease of use. 
To enhance the annotators&#x2019; efficiency, improvements were made to the tool&#x2019;s functionality, and a user-friendly interface was designed to streamline the annotation process. <xref ref-type="fig" rid="figure1">Figure 1</xref> illustrates the interface developed for this task.</p><p>To ensure consistency and standardization in the annotation process, an annotation guideline was developed under the supervision of a cardiology specialist. This guideline provides a detailed framework on relevant linguistic elements (terms, acronyms, and expressions, among others) that annotators must consider in determining the presence (positive case) or absence (negative case) of the symptom in the analyzed documents. <xref ref-type="table" rid="table3">Table 3</xref> presents an excerpt from the annotation guide, illustrating some of the expressions used to indicate HF. Manual labeling was carried out by 2 specialists in medical documentation. To minimize annotation bias, the task was performed independently by each annotator, without mutual interaction. 
Discrepancies were resolved by the cardiology specialist who authored the annotation guidelines, ensuring consistency and validity.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Prodigy interface for dataset annotation.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e76433_fig01.png"/></fig><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Excerpt from the annotation guide with some of the expressions used to label the dataset (English translation is included to facilitate reading).</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">Language and expressions indicating the presence of the symptom</td><td align="left" valign="bottom">Expressions indicating the absence of the symptom</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="3"><bold>Spanish</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Ingreso por ICC</italic><sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> <italic>con FEVI</italic><sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup> <italic>preservada</italic></td><td align="left" valign="top"><italic>Sin signos de IC</italic><sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>I Cardiaca con FE</italic><sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup> <italic>conservada</italic></td><td align="left" valign="top"><italic>ICC sin especificar si preservada o reducida</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Insuficiencia cardiaca diast&#x00F3;lica</italic></td><td align="left" valign="top"><italic>Insuficiencia cardiaca con FEVI reducida</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>IC con funci&#x00F3;n 
sist&#x00F3;lica conservada o preservada</italic></td><td align="left" valign="top"><italic>IC con FE disminuida</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>CF</italic><sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup> <italic>III</italic></td><td align="left" valign="top"><italic>IC sist&#x00F3;lica</italic></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>EAP</italic><sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup></td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>Edema agudo de pulm&#x00F3;n</italic></td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top">NYHA<sup><xref ref-type="table-fn" rid="table3fn7">g</xref></sup> III</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="3"><bold>English</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Admitted for CHF<sup><xref ref-type="table-fn" rid="table3fn8">h</xref></sup> with preserved LVEF<sup><xref ref-type="table-fn" rid="table3fn9">i</xref></sup></td><td align="left" valign="top">No signs of HF<sup><xref ref-type="table-fn" rid="table3fn10">j</xref></sup></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Heart failure with preserved EF<sup><xref ref-type="table-fn" rid="table3fn11">k</xref></sup></td><td align="left" valign="top">CHF unspecified if preserved or reduced</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Diastolic heart failure</td><td align="left" valign="top">Heart failure with reduced LVEF</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">HF with preserved or conserved systolic function</td><td align="left" valign="top">HF with decreased EF</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Class III</td><td align="left" 
valign="top">Systolic HF</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">APE<sup><xref ref-type="table-fn" rid="table3fn12">l</xref></sup></td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Acute pulmonary edema</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top">NYHA III</td><td align="left" valign="top"/></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>ICC: <italic>insuficiencia cardiaca congestiva</italic>.</p></fn><fn id="table3fn2"><p><sup>b</sup>FEVI: <italic>fracci&#x00F3;n de eyecci&#x00F3;n del ventr&#x00ED;culo izquierdo</italic>.</p></fn><fn id="table3fn3"><p><sup>c</sup>IC: <italic>insuficiencia cardiaca</italic>.</p></fn><fn id="table3fn4"><p><sup>d</sup>FE: <italic>fracci&#x00F3;n de eyecci&#x00F3;n</italic>.</p></fn><fn id="table3fn5"><p><sup>e</sup>CF: <italic>clase funcional</italic>.</p></fn><fn id="table3fn6"><p><sup>f</sup>EAP: <italic>edema agudo pulmonar</italic>.</p></fn><fn id="table3fn7"><p><sup>g</sup>NYHA: New York Heart Association.</p></fn><fn id="table3fn8"><p><sup>h</sup>CHF: congestive heart failure.</p></fn><fn id="table3fn9"><p><sup>i</sup>LVEF: left ventricular ejection fraction.</p></fn><fn id="table3fn10"><p><sup>j</sup>HF: heart failure.</p></fn><fn id="table3fn11"><p><sup>k</sup>EF: ejection fraction.</p></fn><fn id="table3fn12"><p><sup>l</sup>APE: acute pulmonary edema.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-2-4"><title>Annotation Statistics</title><p>To quantitatively assess the quality and consistency of the manual annotation process, we computed interannotator agreement metrics and analyzed the final distribution of labels across the dataset. Following the labeling process described above, interannotator agreement was assessed across the full dataset of 15,304 documents. 
A total of 14,610 agreements and 694 disagreements were recorded between the 2 annotators. The distribution of label combinations was as follows: both annotators assigned label &#x201C;0&#x201D; in 13,910 cases and label &#x201C;1&#x201D; in 700 cases, while disagreements involved 499 (label &#x201C;0&#x201D; vs label &#x201C;1&#x201D;) and 195 (label &#x201C;1&#x201D; vs label &#x201C;0&#x201D;) instances. Based on this distribution, the Cohen &#x03BA; coefficient was calculated, yielding a value of 0.645, which indicates substantial agreement and supports the reliability of the annotation process. The distribution of annotation combinations is summarized in <xref ref-type="fig" rid="figure2">Figure 2</xref>, which presents the confusion matrix of interannotator agreement.</p><p>All disagreements were adjudicated by a cardiology specialist in accordance with the predefined guidelines. Of the 694 discordant cases, 254 were assigned label &#x201C;0&#x201D; (absence of symptom) and 440 were assigned label &#x201C;1&#x201D; (presence of symptom). Following this resolution, the final distribution of labels in the dataset comprised 1140 documents labeled as &#x201C;1&#x201D; (7.5%) and 14,164 as &#x201C;0&#x201D; (92.5%), reflecting a high level of class imbalance that increases the complexity of the classification task.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Confusion matrix showing interannotator agreement. Values represent the number of documents assigned to each label by annotator A and annotator B.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e76433_fig02.png"/></fig></sec></sec><sec id="s2-3"><title>Experimentation Framework</title><p>The Transformer architecture has become a cornerstone in NLP, offering high efficiency and versatility across a wide range of tasks. 
Its ability to model long-range dependencies through attention mechanisms has revolutionized the field, driving significant advancements in tasks such as text classification [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref19">19</xref>]. This architecture is distinguished not only by its accuracy but also by its scalability and adaptability across diverse domains and datasets of varying sizes. Transfer learning allows for domain-specific adaptation through fine-tuning, as applied here to the health care domain [<xref ref-type="bibr" rid="ref20">20</xref>]. By leveraging pretrained models that have already learned general language representations from large, diverse corpora, transfer learning facilitates their specialization for more domain-specific datasets through additional training. This adaptability has established Transformers as a powerful tool for advancing NLP in specialized fields.</p><p>A key limitation of Transformer models is their inherent constraint in processing large texts. Their architecture is typically designed to handle a maximum of 512 tokens, posing challenges for tasks that require longer sequences, such as document-level analysis or summarization of extensive texts. To address this limitation, the Longformer architecture [<xref ref-type="bibr" rid="ref21">21</xref>], specifically designed to efficiently handle longer text sequences, was introduced. Longformer incorporates sparse attention mechanisms, allowing it to extend the context window while preserving computational efficiency, thus mitigating a core limitation of traditional Transformer models. This innovation broadens the applicability of Transformer models to tasks involving lengthy documents, such as EHRs. These considerations motivated the inclusion of 3 model categories in this study: general-purpose Transformers, domain-adapted clinical models, and Longformers. 
This categorization allows for a comprehensive comparison of performance across different levels of domain specialization and sequence-length flexibility. General-purpose models serve as a baseline, while fine-tuned clinical models assess the added value of domain-specific adaptation. Longformers, by contrast, address the limitations of standard sequence lengths and offer insight into the role of extended context modeling in clinical NLP.</p><p>In clinical contexts, maximizing recall for the positive class (Recall-1) is often prioritized to capture as many at-risk cases as possible. However, this strategy can lead to an increased number of false positives, elevating alert rates and potentially burdening health care systems. To address this trade-off, we incorporated additional evaluation metrics, specifically the AUC and the <italic>F</italic><sub>1</sub>-score, to support a more balanced classification approach. Each metric provides a distinct perspective: AUC assesses the model&#x2019;s overall ability to discriminate between positive and negative cases, while <italic>F</italic><sub>1</sub>-score balances precision and sensitivity, helping to reduce both false positives and false negatives.</p><sec id="s2-3-1"><title>Models</title><p>As described in the previous section, this study evaluated 3 categories of encoder-based Transformer models: 2 general-purpose, 4 domain-adapted clinical models, and 2 Longformers. General-purpose models serve as a baseline to assess performance in the absence of domain adaptation, whereas fine-tuned clinical models illustrate the effects of transfer learning on domain-specific performance. Longformer models, designed to efficiently process extended text sequences, were included to address tasks requiring long-range contextual understanding, such as detailed medical reports or full-length patient narratives.</p><list list-type="order"><list-item><p><italic>BETO</italic>. 
The Spanish-BERT model [<xref ref-type="bibr" rid="ref22">22</xref>] follows the same architecture as BERT-Base and was trained exclusively on Spanish corpora, including Wikipedia and OPUS data. Specifically, we used the <italic>bert-base-spanish-wwm-cased</italic> model.</p></list-item><list-item><p><italic>RoBERTa</italic>. A robustly optimized BERT pretraining approach, RoBERTa is an improved version of BERT that incorporates modifications to key hyperparameters [<xref ref-type="bibr" rid="ref23">23</xref>]. Several Spanish-language RoBERTa models are available. We used the version pretrained on corpora from the National Library of Spain [<xref ref-type="bibr" rid="ref24">24</xref>]. Specifically, experiments were conducted using the base version, <italic>roberta-base-bne</italic>.</p></list-item><list-item><p><italic>RoBERTa-biomedical</italic> and <italic>RoBERTa-biomedical-clinical</italic> [<xref ref-type="bibr" rid="ref25">25</xref>] are monolingual Spanish RoBERTa-based models trained on a large biomedical and clinical corpus of over 1 billion tokens. These models have demonstrated strong performance in prior Spanish-language biomedical and clinical NLP benchmarks. The specific versions used in this study were <italic>roberta-base-biomedical-es</italic> and <italic>roberta-base-biomedical-clinical-es</italic>.</p></list-item><list-item><p><italic>bsc-bio</italic> and <italic>bsc-bio-ehr</italic> [<xref ref-type="bibr" rid="ref26">26</xref>] are recently developed Spanish-language models, derived from <italic>roberta-base-biomedical-es</italic> and <italic>roberta-base-biomedical-clinical-es</italic>, respectively, and trained on expanded corpora to improve performance. Specifically, <italic>bsc-bio</italic> is a pretrained model for processing biomedical and clinical texts, suitable for tasks such as literature analysis, information extraction, and interpretation of medical guidelines. 
In contrast, <italic>bsc-bio-ehr</italic> is specifically adapted for processing EHRs and clinical notes.</p></list-item><list-item><p><italic>Long Transformer RoBERTa</italic>. This model is a Longformer-based adaptation of RoBERTa for the Spanish language [<xref ref-type="bibr" rid="ref24">24</xref>]. It combines sliding-window (local) and global attention mechanisms, enabling linear scalability with sequence length and facilitating the processing of documents with thousands of tokens. For this study, we used the model <italic>longformer-base-4096-bne-es</italic>.</p></list-item><list-item><p><italic>Long Transformer Biomedical-clinical</italic>. This model extends the RoBERTa-based architecture through Longformer adaptations tailored for Spanish biomedical and clinical texts [<xref ref-type="bibr" rid="ref26">26</xref>]. Initialized from the <italic>roberta-base-biomedical-clinical-es</italic> checkpoint, it underwent further fine-tuning using masked language modeling, specifically targeting long biomedical and clinical documents. We used the model <italic>longformer-base-4096-biomedical-clinical-es</italic> in our experiments. This model, explicitly designed for processing extended biomedical and clinical texts in Spanish, leverages domain-specific pretraining to achieve strong performance in NLP tasks within the health care domain, including the classification of clinical notes.</p></list-item></list></sec><sec id="s2-3-2"><title>Evaluation Metrics</title><p>The models were evaluated using standard performance metrics commonly used in binary classification tasks for clinical documents. Specifically, the following metrics were considered.</p><sec id="s2-3-2-1"><title>Precision</title><p>Precision is the proportion of correctly predicted positive clinical documents relative to the total number of documents classified as positive. 
It is computed as:</p><disp-formula id="E1"><label>(1)</label><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mfrac><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>u</mml:mi><mml:mi>e</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>v</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>u</mml:mi><mml:mi>e</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>v</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mo>+</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mi>f</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>v</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math></disp-formula></sec><sec id="s2-3-2-2"><title>Recall (Sensitivity)</title><p>Recall is the proportion of correctly predicted positive clinical documents relative to the total number of actual positive documents. 
It is computed as:</p><disp-formula id="E2"><label>(2)</label><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>v</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mfrac><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>u</mml:mi><mml:mi>e</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>v</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>u</mml:mi><mml:mi>e</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>v</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mo>+</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mi>f</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>g</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>v</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math></disp-formula></sec><sec id="s2-3-2-3"><title><italic>F</italic><sub>1</sub>-Score</title><p><italic>F</italic><sub>1</sub>-score is the harmonic mean of precision and recall, providing a single metric that balances both aspects of classification performance. 
It is computed as:</p><disp-formula id="E3"><label>(3)</label><mml:math id="eqn3"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi>F</mml:mi><mml:mrow><mml:mn mathvariant="italic">1</mml:mn></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mi>s</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mo>=</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mtext>&#x00A0;</mml:mtext><mml:mo>&#x2217;</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mo>&#x2217;</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mo>+</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math></disp-formula></sec><sec id="s2-3-2-4"><title>Specificity</title><p>Specificity is the proportion of correctly predicted negative clinical documents relative to the total number of actual negative documents. 
It is computed as:</p><disp-formula id="E4"><label>(4)</label><mml:math id="eqn4"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi>s</mml:mi><mml:mi>p</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:mi>i</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mfrac><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>u</mml:mi><mml:mi>e</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>g</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>v</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>u</mml:mi><mml:mi>e</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>g</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>v</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mo>+</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mi>f</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>v</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math></disp-formula></sec><sec id="s2-3-2-5"><title>AUROC Metric</title><p>The AUROC is a performance metric used to evaluate binary classification models. It measures the model&#x2019;s ability to distinguish between positive and negative classes, independent of the decision threshold. The ROC curve plots the true positive rate (or sensitivity) against the false positive rate (or 1-specificity) across different threshold values. 
For all metrics, a <italic>positive document</italic> indicates that it contains the symptom, while a <italic>negative document</italic> indicates that the symptom is not found within the text of the document.</p></sec></sec></sec><sec id="s2-4"><title>Experimental Methodology</title><sec id="s2-4-1"><title>Overview</title><p>Building robust models for detecting complex symptoms, such as those associated with HF in clinical contexts, requires fine-tuning of pretrained models and optimizing both hyperparameters and training strategies for the target task. To this end, the experimental framework was structured into two phases: (1) an initial hyperparameter search using a reduced dataset and (2) a subsequent full training and evaluation phase. During Phase 1, multiple hyperparameter configurations were systematically evaluated for each target metric, with the goal of optimizing models to maximize AUC, <italic>F</italic><sub>1</sub>-score, or recall for the positive class. Phase 2 applied the optimal hyperparameter configurations identified previously to train each model on the full dataset and evaluate performance on the test dataset, enabling the selection of the best-performing configuration per strategy.</p><p>Regarding the preprocessing and normalization of the documents, all texts in the dataset were processed using the tokenizers associated with each pretrained model. No explicit preprocessing was applied in this study to preserve the original lexical and syntactic structure of the clinical narratives. This decision was motivated by the architecture of Transformer-based models, which are designed to capture contextual dependencies directly from raw, unprocessed text.</p></sec><sec id="s2-4-2"><title>Phase 1: Hyperparameter Tuning</title><p>To limit computational demands, a further reduced subset of the training dataset was used during the hyperparameter search. 
This subset included 2000 randomly sampled instances obtained through undersampling, comprising 1500 Class &#x201C;0&#x201D; and 500 Class &#x201C;1&#x201D; examples. The reduction was performed in a proportionally weighted manner to preserve the original distribution of document types within the majority class. <xref ref-type="table" rid="table4">Table 4</xref> summarizes the hyperparameter search space used in our experimentation.</p><p>A comprehensive grid search was conducted to evaluate all possible combinations of defined parameter values. This approach allows for systematic exploration of the hyperparameter space to identify the optimal configuration for each target performance metric. Optimization was carried out independently for each predefined strategy: (1) maximizing AUC, (2) maximizing <italic>F</italic><sub>1</sub>-score, and (3) maximizing recall (sensitivity) for the positive class.</p><p><xref ref-type="table" rid="table5">Tables 5</xref><xref ref-type="table" rid="table6"/>-<xref ref-type="table" rid="table7">7</xref> present the models and their optimal configurations for each optimization strategy, detailing key parameters such as batch size, learning rate, weight decay, and optimizer. This systematic, metric-driven optimization process enables the identification of the ideal configuration for each model based on the target metric, thereby enhancing model performance according to the specific requirements of the symptom classification task. As shown in these tables, the optimal hyperparameter values vary depending on the optimization strategy used. For instance, the bio model achieves better performance in AUC optimization when trained with a batch size of 16. However, its performance improves for <italic>F</italic><sub>1</sub>-score and sensitivity optimization when batch sizes of 32 and 8 are used, respectively. 
By conducting an exhaustive search for these values, the final models attained optimal performance for each metric.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Hyperparameter search space.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Hyperparameter</td><td align="left" valign="bottom">Values</td></tr></thead><tbody><tr><td align="left" valign="top">Number of epochs</td><td align="left" valign="top">10 (using early stopping)</td></tr><tr><td align="left" valign="top">Batch size</td><td align="left" valign="top">[8, 16, 32]</td></tr><tr><td align="left" valign="top">Learning rate</td><td align="left" valign="top">[2e-5, 3e-5, 5e-5]</td></tr><tr><td align="left" valign="top">Weight decay</td><td align="left" valign="top">[0.1, 0.01, 0.001]</td></tr><tr><td align="left" valign="top">Optimizer</td><td align="left" valign="top">[adamw_hf, adamw_torch, adafactor]</td></tr></tbody></table></table-wrap><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Best hyperparameter values for the area under the curve optimization strategy.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Batch size</td><td align="left" valign="bottom">Learning rate</td><td align="left" valign="bottom">Weight decay</td><td align="left" valign="bottom">Optimizer</td></tr></thead><tbody><tr><td align="left" valign="top">BETO</td><td align="left" valign="top">8</td><td align="left" valign="top">5e-05</td><td align="left" valign="top">0.001</td><td align="left" valign="top">adamw_torch</td></tr><tr><td align="left" valign="top">RoBERTa</td><td align="left" valign="top">8</td><td align="left" valign="top">2e-05</td><td align="left" valign="top">0.01</td><td align="left" valign="top">adafactor</td></tr><tr><td align="left" valign="top">RoBERTa-biomedical</td><td align="left" valign="top">16</td><td 
align="left" valign="top">2e-05</td><td align="left" valign="top">0.01</td><td align="left" valign="top">adamw_torch</td></tr><tr><td align="left" valign="top">RoBERTa-biomedical-clinical</td><td align="left" valign="top">16</td><td align="left" valign="top">5e-05</td><td align="left" valign="top">0.001</td><td align="left" valign="top">adamw_torch</td></tr><tr><td align="left" valign="top">bsc-bio</td><td align="left" valign="top">16</td><td align="left" valign="top">5e-05</td><td align="left" valign="top">0.001</td><td align="left" valign="top">adafactor</td></tr><tr><td align="left" valign="top">bsc-bio-ehr</td><td align="left" valign="top">32</td><td align="left" valign="top">3e-05</td><td align="left" valign="top">0.01</td><td align="left" valign="top">adamw_hf</td></tr><tr><td align="left" valign="top">Long Transformer RoBERTa</td><td align="left" valign="top">8</td><td align="left" valign="top">3e-05</td><td align="left" valign="top">0.01</td><td align="left" valign="top">adamw_hf</td></tr><tr><td align="left" valign="top">Long Transformer Biomedical-clinical</td><td align="left" valign="top">8</td><td align="left" valign="top">5e-05</td><td align="left" valign="top">0.01</td><td align="left" valign="top">adamw_hf</td></tr></tbody></table></table-wrap><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Best hyperparameter values for the <italic>F</italic><sub>1</sub>-score optimization strategy.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Batch size</td><td align="left" valign="bottom">Learning rate</td><td align="left" valign="bottom">Weight decay</td><td align="left" valign="bottom">Optimizer</td></tr></thead><tbody><tr><td align="left" valign="top">BETO</td><td align="left" valign="top">8</td><td align="left" valign="top">5e-05</td><td align="left" valign="top">0.001</td><td align="left" valign="top">adamw_torch</td></tr><tr><td 
align="left" valign="top">RoBERTa</td><td align="left" valign="top">16</td><td align="left" valign="top">5e-05</td><td align="left" valign="top">0.01</td><td align="left" valign="top">adafactor</td></tr><tr><td align="left" valign="top">RoBERTa-biomedical</td><td align="left" valign="top">16</td><td align="left" valign="top">5e-05</td><td align="left" valign="top">0.01</td><td align="left" valign="top">adamw_torch</td></tr><tr><td align="left" valign="top">RoBERTa-biomedical-clinical</td><td align="left" valign="top">8</td><td align="left" valign="top">3e-05</td><td align="left" valign="top">0.1</td><td align="left" valign="top">adamw_hf</td></tr><tr><td align="left" valign="top">bsc-bio</td><td align="left" valign="top">32</td><td align="left" valign="top">5e-05</td><td align="left" valign="top">0.1</td><td align="left" valign="top">adafactor</td></tr><tr><td align="left" valign="top">bsc-bio-ehr</td><td align="left" valign="top">8</td><td align="left" valign="top">2e-05</td><td align="left" valign="top">0.01</td><td align="left" valign="top">adamw_torch</td></tr><tr><td align="left" valign="top">Long Transformer RoBERTa</td><td align="left" valign="top">8</td><td align="left" valign="top">3e-05</td><td align="left" valign="top">0.01</td><td align="left" valign="top">adamw_hf</td></tr><tr><td align="left" valign="top">Long Transformer Biomedical-clinical</td><td align="left" valign="top">8</td><td align="left" valign="top">3e-05</td><td align="left" valign="top">0.001</td><td align="left" valign="top">adamw_hf</td></tr></tbody></table></table-wrap><table-wrap id="t7" position="float"><label>Table 7.</label><caption><p>Best hyperparameter values for the sensitivity optimization strategy.</p></caption><table id="table7" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Batch size</td><td align="left" valign="bottom">Learning rate</td><td align="left" valign="bottom">Weight decay</td><td align="left" 
valign="bottom">Optimizer</td></tr></thead><tbody><tr><td align="left" valign="top">BETO</td><td align="left" valign="top">8</td><td align="left" valign="top">5e-05</td><td align="left" valign="top">0.1</td><td align="left" valign="top">adamw_torch</td></tr><tr><td align="left" valign="top">RoBERTa</td><td align="left" valign="top">8</td><td align="left" valign="top">2e-05</td><td align="left" valign="top">0.01</td><td align="left" valign="top">adafactor</td></tr><tr><td align="left" valign="top">RoBERTa-biomedical</td><td align="left" valign="top">16</td><td align="left" valign="top">2e-05</td><td align="left" valign="top">0.001</td><td align="left" valign="top">adamw_torch</td></tr><tr><td align="left" valign="top">RoBERTa-biomedical-clinical</td><td align="left" valign="top">32</td><td align="left" valign="top">5e-05</td><td align="left" valign="top">0.001</td><td align="left" valign="top">adamw_hf</td></tr><tr><td align="left" valign="top">bsc-bio</td><td align="left" valign="top">8</td><td align="left" valign="top">5e-05</td><td align="left" valign="top">0.1</td><td align="left" valign="top">adamw_hf</td></tr><tr><td align="left" valign="top">bsc-bio-ehr</td><td align="left" valign="top">8</td><td align="left" valign="top">2e-05</td><td align="left" valign="top">0.1</td><td align="left" valign="top">adamw_hf</td></tr><tr><td align="left" valign="top">Long Transformer RoBERTa</td><td align="left" valign="top">8</td><td align="left" valign="top">2e-05</td><td align="left" valign="top">0.1</td><td align="left" valign="top">adamw_torch</td></tr><tr><td align="left" valign="top">Long Transformer Biomedical-clinical</td><td align="left" valign="top">8</td><td align="left" valign="top">5e-05</td><td align="left" valign="top">0.001</td><td align="left" valign="top">adafactor</td></tr></tbody></table></table-wrap></sec><sec id="s2-4-3"><title>Phase 2: Final Training and Evaluation</title><p>Using the optimal hyperparameters identified in Phase 1, the pretrained models 
were subsequently fine-tuned, each according to a specific optimization strategy (AUC, <italic>F</italic><sub>1</sub>-score, or sensitivity). Given the pronounced class imbalance in the original training dataset, with a much larger proportion of majority class (label &#x201C;0&#x201D;) samples than minority class (label &#x201C;1&#x201D;), a reduced training dataset was created using undersampling. All minority class examples (n=1176) were retained, and 3 times that number were randomly sampled from the majority class (n=3528), yielding a training dataset of 4704 instances. To enable model tuning without compromising the final evaluation, 20% of the reduced training dataset was reserved as a validation dataset. Undersampling helps mitigate class imbalance by ensuring a more equitable representation of classes. This technique yields a manageable dataset size while maintaining a class distribution appropriate for the task, especially for detecting clinical symptoms in both classes. To preserve model generalizability, the test dataset was kept intact and fully isolated from the training process, enabling objective performance evaluation across the full dataset. Although undersampling can lead to the exclusion of some majority class instances, this trade-off was deemed acceptable to ensure an efficient and clinically relevant training process.</p><p>Alternative strategies to handle class imbalance were also considered during experimentation [<xref ref-type="bibr" rid="ref27">27</xref>]. Oversampling methods, such as SMOTE (synthetic minority oversampling technique), were discarded due to their incompatibility with text-based data, as synthetic samples generated in vector space tend to lack semantic coherence. We also tested generating synthetic minority-class documents using pretrained language models, but the results were often clinically implausible or grammatically inconsistent. 
Additionally, class weighting was implemented in early trials, but this approach led to unstable training dynamics and overfitting to the minority class, particularly in long-document models. Based on these observations, stratified undersampling was selected as the most effective and interpretable solution for our setting.</p></sec></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>This section presents and analyzes the results obtained on the test dataset for the models trained under the 3 previously described optimization strategies. The dataset was divided into training (80%) and test (20%) subsets, with 20% of the reduced training dataset reserved for validation, to support model development and evaluation. <xref ref-type="table" rid="table8">Table 8</xref> shows the number of documents in each dataset partition. As previously described, an undersampling strategy was applied to the training and validation datasets to mitigate class imbalance.</p><table-wrap id="t8" position="float"><label>Table 8.</label><caption><p>Size and label distribution in the training, validation, and test datasets.</p></caption><table id="table8" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dataset</td><td align="left" valign="bottom">Number of documents</td><td align="left" valign="bottom">Label &#x201C;0&#x201D;</td><td align="left" valign="bottom">Label &#x201C;1&#x201D;</td></tr></thead><tbody><tr><td align="left" valign="top">Original</td><td align="left" valign="top">15,304</td><td align="left" valign="top">13,853</td><td align="left" valign="top">1451</td></tr><tr><td align="left" valign="top">Training</td><td align="left" valign="top">3764</td><td align="left" valign="top">2823</td><td align="left" valign="top">941</td></tr><tr><td align="left" valign="top">Validation</td><td align="left" valign="top">940</td><td align="left" valign="top">705</td><td align="left" valign="top">235</td></tr><tr><td align="left" valign="top">Test</td><td align="left" valign="top">3032</td><td 
align="left" valign="top">2757</td><td align="left" valign="top">275</td></tr></tbody></table></table-wrap><p>To improve model robustness and ensure greater statistical validity, two strategies were implemented in the construction of the test dataset:</p><list list-type="order"><list-item><p>Ensuring patient-level separation between the training and test datasets. That is, test documents originated from patients not included during model training. This ensured that the test dataset contained entirely unseen data for the model. Satisfactory results under this condition would indicate strong generalization capability in detecting HFpEF symptoms.</p></list-item><list-item><p>Preserving the original class distribution in the test dataset (91% class &#x201C;0,&#x201D; 9% class &#x201C;1&#x201D;). This strategy allows evaluation under realistic conditions reflecting the natural class imbalance between negative and positive HFpEF cases.</p></list-item></list><p><xref ref-type="table" rid="table9">Tables 9</xref><xref ref-type="table" rid="table10"/>-<xref ref-type="table" rid="table11">11</xref> summarize the performance of the 6 selected language models trained under the 3 optimization strategies. The evaluation metrics indicate robust and consistent performance across models. A key observation is the consistently high AUC values across models, all surpassing the 0.80 threshold considered clinically significant [<xref ref-type="bibr" rid="ref28">28</xref>]. <xref ref-type="fig" rid="figure3">Figure 3</xref> displays the ROC curves of the models that achieved the best performance in terms of AUC.</p><p>Overall, the <italic>Long Transformer Biomedical-clinical</italic> model achieved the highest performance, highlighting that domain-adapted pretrained models generally outperform general-purpose counterparts in specialized clinical tasks. 
Moreover, models capable of processing extended input sequences exhibit superior suitability for clinical domains, where documents often exceed the standard 512-token limitation of conventional Transformer architectures.</p><p>As shown in <xref ref-type="fig" rid="figure4">Figure 4</xref>, analysis of token distributions across models revealed that the majority of clinical documents (approximately 85%&#x2010;95%, depending on the tokenizer) contain 512 tokens or fewer. However, a nonnegligible subset of documents (5%&#x2010;15%) exceeds this limit, underscoring the presence of longer narratives in the dataset. In standard Transformer-based models, these longer documents were truncated during processing due to sequence length limitations. Although this limitation may result in the loss of relevant contextual information, the models still achieved robust performance, indicating their effectiveness even when operating on truncated inputs. To better handle longer documents, Longformer-based models were used, enabling a maximum input length of 1024 tokens. This extended capacity allows for more complete document representations while preserving computational efficiency. Although only a small fraction of the dataset (1%&#x2010;2%) exceeded 1024 tokens, the increased sequence length enhanced contextual coverage for longer documents compared to standard Transformer models. The inclusion of Longformers in the experimental framework highlights their value in handling tasks involving lengthy and complex texts, particularly in clinical NLP, where capturing contextual nuances is crucial for effective classification.</p><p>In diagnostic applications, minimizing the false negative rate is critical to improving the model&#x2019;s ability to correctly identify positive cases and facilitate early disease detection. Accordingly, sensitivity was prioritized in one experimental condition by training a model specifically optimized for this metric. 
This approach increased the correct classification rate of positive cases, thereby reducing the likelihood of missing patients in need of further clinical evaluation. To evaluate the impact of each optimization strategy on model performance, confusion matrices were analyzed on the test dataset. <xref ref-type="fig" rid="figure5">Figure 5</xref> shows the confusion matrices of the best-performing models under each optimization strategy.</p><p>The trained Transformer-based models exhibited high sensitivity across all 3 optimization strategies, consistently achieving low false-negative rates. In particular, classification accuracy for the positive class exceeded 96% in all cases, highlighting the robustness of both the annotated corpus and the experimental design. These findings underscore not only the models&#x2019; effectiveness in detecting positive cases, but also the consistency and reliability of the adopted methodology. Confusion matrix analysis reveals subtle performance differences depending on the optimization strategy applied. Notably, the sensitivity-optimized strategy reduced false negatives to just 8, resulting in a 97% accuracy rate for positive case detection. However, this improvement came at the cost of increased false positives (95), illustrating the inherent trade-off between sensitivity and specificity. In contrast, the AUC and <italic>F</italic><sub>1</sub>-score strategies, which aim to balance accuracy across classes, significantly reduced false positives (35 and 37, respectively). 
Simultaneously, these strategies maintained relatively low false-negative counts (12 and 9, respectively).</p><table-wrap id="t9" position="float"><label>Table 9.</label><caption><p>Performance metrics obtained on the test dataset for all evaluated models under the AUC<sup><xref ref-type="table-fn" rid="table9fn1">a</xref></sup>-based optimization strategy.</p></caption><table id="table9" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Sensitivity</td><td align="left" valign="bottom">Specificity</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">AUC</td></tr></thead><tbody><tr><td align="left" valign="top">BETO</td><td align="left" valign="top">0.920</td><td align="left" valign="top">0.960</td><td align="left" valign="top">0.959</td><td align="left" valign="top">0.940</td></tr><tr><td align="left" valign="top">RoBERTa</td><td align="left" valign="top">0.938</td><td align="left" valign="top">0.985</td><td align="left" valign="top">0.983</td><td align="left" valign="top">0.962</td></tr><tr><td align="left" valign="top">RoBERTa-biomedical</td><td align="left" valign="top">0.960</td><td align="left" valign="top">0.972</td><td align="left" valign="top">0.972</td><td align="left" valign="top">0.966</td></tr><tr><td align="left" valign="top">RoBERTa-biomedical-clinical</td><td align="left" valign="top">0.967</td><td align="left" valign="top">0.961</td><td align="left" valign="top">0.964</td><td align="left" valign="top">0.964</td></tr><tr><td align="left" valign="top">bsc-bio</td><td align="left" valign="top">0.945</td><td align="left" valign="top">0.980</td><td align="left" valign="top">0.977</td><td align="left" valign="top">0.963</td></tr><tr><td align="left" valign="top">bsc-bio-ehr</td><td align="left" valign="top">0.953</td><td align="left" valign="top">0.980</td><td align="left" valign="top">0.978</td><td align="left" 
valign="top">0.966</td></tr><tr><td align="left" valign="top">Long Transformer RoBERTa</td><td align="left" valign="top">0.960</td><td align="left" valign="top">0.952</td><td align="left" valign="top">0.957</td><td align="left" valign="top">0.956</td></tr><tr><td align="left" valign="top">Long Transformer Biomedical-clinical</td><td align="left" valign="top">0.956</td><td align="left" valign="top">0.987</td><td align="left" valign="top">0.985</td><td align="left" valign="top">0.971</td></tr></tbody></table><table-wrap-foot><fn id="table9fn1"><p><sup>a</sup>AUC: area under the curve.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t10" position="float"><label>Table 10.</label><caption><p>Performance metrics obtained on the test dataset for all evaluated models under the <italic>F</italic><sub>1</sub>-score&#x2013;based optimization strategy.</p></caption><table id="table10" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Sensitivity</td><td align="left" valign="bottom">Specificity</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">AUC</td></tr></thead><tbody><tr><td align="left" valign="top">BETO</td><td align="left" valign="top">0.949</td><td align="left" valign="top">0.935</td><td align="left" valign="top">0.942</td><td align="left" valign="top">0.942</td></tr><tr><td align="left" valign="top">RoBERTa</td><td align="left" valign="top">0.967</td><td align="left" valign="top">0.946</td><td align="left" valign="top">0.952</td><td align="left" valign="top">0.957</td></tr><tr><td align="left" valign="top">RoBERTa-biomedical</td><td align="left" valign="top">0.985</td><td align="left" valign="top">0.849</td><td align="left" valign="top">0.885</td><td align="left" valign="top">0.917</td></tr><tr><td align="left" valign="top">RoBERTa-biomedical-clinical</td><td align="left" valign="top">0.967</td><td align="left" 
valign="top">0.923</td><td align="left" valign="top">0.936</td><td align="left" valign="top">0.945</td></tr><tr><td align="left" valign="top">bsc-bio</td><td align="left" valign="top">0.956</td><td align="left" valign="top">0.971</td><td align="left" valign="top">0.971</td><td align="left" valign="top">0.963</td></tr><tr><td align="left" valign="top">bsc-bio-ehr</td><td align="left" valign="top">0.956</td><td align="left" valign="top">0.971</td><td align="left" valign="top">0.971</td><td align="left" valign="top">0.964</td></tr><tr><td align="left" valign="top">Long Transformer RoBERTa</td><td align="left" valign="top">0.920</td><td align="left" valign="top">0.987</td><td align="left" valign="top">0.981</td><td align="left" valign="top">0.954</td></tr><tr><td align="left" valign="top">Long Transformer Biomedical-clinical</td><td align="left" valign="top">0.987</td><td align="left" valign="top">0.987</td><td align="left" valign="top">0.985</td><td align="left" valign="top">0.987</td></tr></tbody></table></table-wrap><table-wrap id="t11" position="float"><label>Table 11.</label><caption><p>Performance metrics obtained on the test dataset for all evaluated models under the sensitivity-based optimization strategy.</p></caption><table id="table11" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Sensitivity</td><td align="left" valign="bottom">Specificity</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">AUC<sup><xref ref-type="table-fn" rid="table11fn1">a</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">BETO</td><td align="left" valign="top">0.938</td><td align="left" valign="top">0.955</td><td align="left" valign="top">0.957</td><td align="left" valign="top">0.947</td></tr><tr><td align="left" valign="top">RoBERTa</td><td align="left" valign="top">0.938</td><td align="left" valign="top">0.986</td><td align="left" 
valign="top">0.983</td><td align="left" valign="top">0.962</td></tr><tr><td align="left" valign="top">RoBERTa-biomedical</td><td align="left" valign="top">0.953</td><td align="left" valign="top">0.934</td><td align="left" valign="top">0.942</td><td align="left" valign="top">0.943</td></tr><tr><td align="left" valign="top">RoBERTa-biomedical-clinical</td><td align="left" valign="top">0.967</td><td align="left" valign="top">0.925</td><td align="left" valign="top">0.937</td><td align="left" valign="top">0.946</td></tr><tr><td align="left" valign="top">bsc-bio</td><td align="left" valign="top">0.985</td><td align="left" valign="top">0.943</td><td align="left" valign="top">0.952</td><td align="left" valign="top">0.964</td></tr><tr><td align="left" valign="top">bsc-bio-ehr</td><td align="left" valign="top">0.960</td><td align="left" valign="top">0.978</td><td align="left" valign="top">0.977</td><td align="left" valign="top">0.969</td></tr><tr><td align="left" valign="top">Long Transformer RoBERTa</td><td align="left" valign="top">0.979</td><td align="left" valign="top">0.956</td><td align="left" valign="top">0.968</td><td align="left" valign="top">0.968</td></tr><tr><td align="left" valign="top">Long Transformer Biomedical-clinical</td><td align="left" valign="top">0.971</td><td align="left" valign="top">0.922</td><td align="left" valign="top">0.935</td><td align="left" valign="top">0.946</td></tr></tbody></table><table-wrap-foot><fn id="table11fn1"><p><sup>a</sup>AUC: area under the curve.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>ROC curves for the test dataset, illustrating the performance of models optimized using 3 different hyperparameter optimization strategies. 
(<bold>A</bold>) Optimization based on AUC using the <italic>Longformer Biomedical-clinical</italic> model; (<bold>B</bold>) Optimization based on <italic>F</italic><sub>1</sub>-score using the same model; (<bold>C</bold>) Optimization based on sensitivity using the <italic>bsc-bio-ehr</italic> model. For each strategy, the model shown corresponds to the one that achieved the highest AUC on the test dataset. AUC: area under the curve; ROC: receiver operating characteristic.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e76433_fig03.png"/></fig><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Distribution of document lengths (in tokens) across the full dataset, computed using each model&#x2019;s tokenizer. The x-axis represents token count ranges, and the y-axis lists the tokenizers associated with the evaluated language models.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e76433_fig04.png"/></fig><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Confusion matrices showing classification results on the test dataset using 3 different hyperparameter optimization strategies: (<bold>A</bold>) AUC-based optimization using the <italic>Longformer Biomedical-Clinical</italic> model; (<bold>B</bold>) <italic>F</italic><sub>1</sub>-score&#x2013;based optimization using the same model; and (<bold>C</bold>) sensitivity-based optimization using the <italic>Longformer RoBERTa</italic> model. Each matrix shows the number of true positives, true negatives, false positives, and false negatives predicted by the respective model. These results illustrate the trade-offs introduced by each optimization criterion in terms of sensitivity and specificity. 
AUC: area under the curve.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e76433_fig05.png"/></fig></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>This study shows that the performance of Transformer-based models in HFpEF symptom detection can be substantially influenced by the chosen optimization strategy. Each optimization target (AUC, <italic>F</italic><sub>1</sub>-score, or sensitivity) serves a distinct clinical and operational objective. Notably, the sensitivity-driven strategy proved particularly effective in minimizing false negatives, a critical consideration in early disease detection scenarios. This highlights the value of aligning model training objectives with specific clinical priorities when developing NLP tools for medical applications.</p><p>It should be noted that direct comparisons with related studies are not feasible, as each uses different datasets. Nevertheless, the strength of the results confirms the validity of the experimental design and the robustness of the trained models under the proposed optimization strategies, as reflected in the evaluation metrics. Additionally, the involvement of cardiology specialists provides expert validation and qualitative support for these findings.</p><p>Beyond retrospective evaluation, two real-world integration scenarios are envisioned: (1) batch-mode screening, where the model is periodically applied to existing EHR data to prioritize patients for further testing, and (2) real-time alerting, where symptom mentions in clinical notes trigger nonbinding alerts to the clinician. In either case, the system would function as a clinical decision support tool. To mitigate false positives, adjustable probability thresholds and explanatory interfaces that highlight the relevant text segments are recommended. 
These strategies align with current frameworks for trustworthy and transparent AI in health care.</p></sec><sec id="s4-2"><title>Error Analysis</title><p>A qualitative error analysis was performed from a clinical perspective, focusing on both false negative and false positive cases. Special attention was given to false negatives to identify the reasons why the models predicted the absence of the symptom in a document (label &#x201C;0&#x201D;) when, in reality, it had been classified as positive, meaning that the document contained information indicating the presence of the symptom. We selected the misclassified documents from the 3 models with the lowest false negative rates (see <xref ref-type="fig" rid="figure5">Figure 5</xref>).</p><p><xref ref-type="other" rid="box1">Textbox 1</xref> presents a false negative example, in which all models predicted the case as negative, despite human annotation indicating the presence of the symptom. However, a review of the annotation guidelines revealed that the annotator&#x2019;s interpretation relied on clinical reasoning that, while valid, extended beyond the defined annotation criteria. The text includes terms such as <italic>&#x201C;cardiomegalia, signos de hipertensi&#x00F3;n pulmonar precapilar y poscapilar con infiltrados de edema intersticial alveolar bilaterales&#x201D;</italic> (&#x201C;cardiomegaly, signs of pre- and post-capillary pulmonary hypertension with bilateral interstitial-alveolar edema infiltrates&#x201D;), which can be associated with HFpEF. However, the guidelines did not explicitly state that these findings alone were sufficient for a positive classification without additional diagnostic information or clinical context. In this regard, the model&#x2019;s prediction strictly adhered to the established criteria, whereas the human annotation reflected a broader interpretation based on implicit clinical knowledge. 
This example highlights the inherent complexity of symptom detection and the subtle distinction between rule-based annotation and clinically contextual judgment. Despite these occasional discrepancies, the models have demonstrated a high level of consistency in applying the predefined criteria. Future revisions of the annotation process could explore potential adjustments to better align guideline application with the expected clinical reasoning for this task.</p><p>Conversely, <xref ref-type="other" rid="box2">Textbox 2</xref> shows an example of a false positive error. In this case, the models incorrectly predicted the presence of HFpEF symptoms in a document explicitly indicating their absence. Although the key phrase <italic>&#x201C;no existe da&#x00F1;o renal ni signos de ICC&#x201D;</italic> (&#x201C;no kidney damage or signs of CHF&#x201D;) clearly indicated the negative class, the models failed to interpret the negation correctly in this context. Such errors are a well-known challenge in clinical NLP, where negation is often expressed using diverse and context-dependent linguistic patterns. Notably, all 3 models produced the same misclassification, indicating that the difficulty in interpreting negation structures is model-independent.</p><p>Despite these specific instances, the overall performance of the models has been consistent across most evaluated cases. However, future improvements could focus on refining negation handling through domain-adapted preprocessing pipelines or model adaptations that explicitly account for linguistic cues. In particular, the integration of rule-based algorithms, enhanced tokenization schemes, or syntactic features could support more accurate interpretation of negated or speculative statements. These enhancements represent promising avenues for improving model robustness in real-world clinical applications.</p><boxed-text id="box1"><title> Example of a misclassified document (false negative). 
Document manually labeled as positive and classified as negative by all three classifiers. The English translation is included to facilitate reading.</title><p><bold>Spanish:</bold></p><p><italic>Ecg: en fa a unos 135-140 lpm, e isquemia subendocardica lateral.cr 1.6, probnp 6869, tnt 20, iones normales, hb 10.7g, inr tp 2.8.bq planta: cr 1.40mg, ggtp 84u, urea 68mg, iones normales, hierro 24 mcg, ab 3.3g, mg 1.52 mg.pco2 63, ph 7.33, bicarbonato 33.orina normal.rx de torax: cardiomegalia, signos de hipertension pulmonar precapilar y poscapilar con infiltrados de edema intersticioalveolar bilaterales.</italic></p><p><bold>English:</bold></p><p>ECG: atrial fibrillation at approximately 135-140 bpm, with lateral subendocardial ischemia. CR 1.6, ProBNP 6869, TnT 20, normal ions, Hb 10.7 g, INR/TP 2.8. Ward: CR 1.40 mg, GGTP 84 U, urea 68 mg, normal ions, iron 24 mcg, AB 3.3 g, Mg 1.52 mg. PCO2 63, pH 7.33, bicarbonate 33. Urinalysis: normal. Chest X-ray: cardiomegaly, signs of pre- and postcapillary pulmonary hypertension, with bilateral interstitial-alveolar edema infiltrates.</p></boxed-text><boxed-text id="box2"><title> Example of a misclassified document (false positive). Document manually labeled as negative and classified as positive by all 3 classifiers. The English translation is included to facilitate reading.</title><p><bold>Spanish:</bold></p><p><italic>Hoy tiene prevista tercera infusi&#x00F3;n del f&#x00E1;rmaco. peso 49.5 kg seg&#x00FA;n lo recogido hoy. cl&#x00ED;nicamente bien, salvo edemas en ambos mal&#x00E9;olos, creo que en relaci&#x00F3;n al patisiran pues no existe da&#x00F1;o renal ni signos de icc.ef. beg. acr con cor r&#x00ED;tmico, sin soplos audibles, con bmv. abdomen sin masas ni megalias. mmii cion edemas que dejan f&#x00F3;ve a nivel maleolar. 
no signos de tvp.plan:- pr&#x00F3;xima cita 10 de julio- controles anal&#x00ED;ticos los de nefrolog&#x00ED;a por ahora (niveles de tacrolimus en rango en &#x00FA;ltimo control).</italic></p><p><bold>English:</bold></p><p>Today, the third infusion of the drug is scheduled. Weight: 49.5 kg as recorded today. Clinically well, except for edema in both malleoli, which I believe is related to patisiran, as there is no renal damage or signs of heart failure. Physical examination: well-nourished and in good general condition. Peripheral circulation: rhythmic heart sounds, no audible murmurs, with good bilateral breath sounds. Abdomen: no masses or organ enlargement. Lower limbs: with edema that leaves a pit at the malleolar level. No signs of deep vein thrombosis. Plan: next appointment: July 10. Laboratory tests: nephrology monitoring for now (tacrolimus levels within range in the last test).</p></boxed-text></sec><sec id="s4-3"><title>Clinical Implications of Misclassification Errors</title><p>From a clinical standpoint, accurately identifying patients with suspected HFpEF is critical for ensuring appropriate management. Classification errors may result in the omission of key diagnostic procedures or delays in initiating disease-modifying therapies. In particular, false negatives represent a major concern, as they may leave early-stage patients undiagnosed and untreated. Conversely, false positives may prompt unnecessary diagnostic procedures, increasing both clinical workload and health care costs. Therefore, tuning strategies that prioritize sensitivity while preserving acceptable specificity are essential for the effective deployment of these models in clinical settings.</p></sec><sec id="s4-4"><title>Limitations</title><p>This study has several limitations that should be acknowledged. First, all clinical data were obtained from a single hospital in Spain, using one specific EHR system. 
As a result, the trained models reflect documentation practices, linguistic patterns, and institutional conventions specific to that context. This may limit the generalizability of the findings to other health care settings, regions, or EHR platforms. Second, no external validation was performed on datasets from other institutions or countries. Although the models demonstrated strong internal performance, further evaluation on external corpora will be essential to assess their robustness and adaptability to different clinical environments and documentation styles. As highlighted in recent studies, external validation is a critical step to ensure the reliability and applicability of NLP models across diverse health care systems [<xref ref-type="bibr" rid="ref29">29</xref>]. Despite these limitations, this work provides a valuable foundation for developing NLP systems tailored to Spanish-language clinical narratives and highlights the feasibility of symptom detection in the context of cardiac amyloidosis. Future research should focus on validating and refining these models in multicenter, cross-national settings to enhance their clinical relevance.</p></sec><sec id="s4-5"><title>Discussion of Ethical Considerations</title><p>This study was conducted using anonymized clinical data in accordance with the General Data Protection Regulation (regulation [EU] 2016/679) and Spanish data protection laws (<italic>Ley Org&#x00E1;nica 3/2018 de Protecci&#x00F3;n de Datos Personales y garant&#x00ED;a de los derechos digitales</italic>). Data use was approved by the institutional ethics committee. Looking forward, any clinical deployment of the proposed AI system would need to comply with the EU Artificial Intelligence Act and, depending on its intended function, may also fall under the scope of the Medical Device Regulation (regulation [EU] 2017/745). 
Consequently, future implementations will require robust validation, human oversight mechanisms, and explainability features to ensure safe and ethical integration into clinical practice.</p></sec><sec id="s4-6"><title>Conclusions</title><p>This study proposed an experimental framework for optimizing Transformer-based language models to automatically detect clinical documents suggesting the presence of HFpEF. Our findings underscore the importance of hyperparameter tuning, as optimal configurations directly influence model performance according to the prioritized evaluation metric. Specifically, to minimize false negatives, optimization should prioritize model sensitivity. This approach allows model performance to be adapted to varying clinical and operational requirements, maintaining a suitable balance between sensitivity and specificity depending on the context of use.</p><p>It has been demonstrated that the use of domain-specific pretrained language models significantly improves adaptability and transferability for specialized clinical tasks. This advantage lies in the ability of context-aware models to better capture linguistic patterns, domain-specific terminology, and semantic relationships, thereby enhancing task-specific performance. Moreover, the capacity to process longer sequences during fine-tuning and inference is especially beneficial in the clinical domain. Medical documents are often lengthy and rich in interrelated concepts, which may be lost when contextual processing is constrained. Therefore, using models designed for extended input sequences is essential for comprehensive and accurate clinical text analysis.</p><p>Having validated our experimental approach and confirmed model reliability for HFpEF detection, future work will focus on developing and fine-tuning models capable of comprehensively identifying all symptoms associated with cardiac amyloidosis. 
To this end, the top-performing models from this study will be retrained on manually annotated corpora to incorporate the full range of clinically relevant manifestations. This process will improve the system&#x2019;s ability to detect textual patterns indicative of the disease with increased precision and sensitivity. The ultimate aim is to implement an automated screening system based on EHR analysis to support the early identification of patients with suspected cardiac amyloidosis. This approach may optimize diagnostic workflows, improve clinical decision-making, and enable more personalized treatment strategies for a condition that remains difficult to detect at early stages.</p></sec></sec></body><back><ack><p>We thank Mar&#x00ED;a Isabel Rengel, IT Specialist from the Team IT SAS Huelva, for her valuable support in extracting data from clinical records and curating clinical episodes. Her work in identifying relevant terms related to potential signs of amyloidosis has been essential to the quality of our dataset. This project was funded by the Institute of Health Carlos III, Ministry of Science, Innovation and Universities, Spanish Government (grant number PI20/01485).</p></ack><notes><sec><title>Data Availability</title><p>Data will be made available on request.</p></sec></notes><fn-group><fn fn-type="con"><p>JM: conceptualization, data curation, formal analysis, investigation, methodology, software, supervision, validation, visualization, writing &#x2013; original draft, writing &#x2013; review &#x0026; editing. VP: conceptualization, data curation, formal analysis, investigation, methodology, software, validation, writing &#x2013; original draft, writing &#x2013; review &#x0026; editing. AM: conceptualization, data curation, investigation, resources, validation, writing &#x2013; original draft, writing &#x2013; review &#x0026; editing. MJM: conceptualization, data curation, writing &#x2013; original draft. 
MdlV: conceptualization, data curation, writing &#x2013; original draft.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">ATTR-CM</term><def><p>transthyretin cardiac amyloidosis</p></def></def-item><def-item><term id="abb3">AUC</term><def><p>area under the curve</p></def></def-item><def-item><term id="abb4">AUROC</term><def><p>area under the receiver operating characteristic curve</p></def></def-item><def-item><term id="abb5">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb6">HF</term><def><p>heart failure</p></def></def-item><def-item><term id="abb7">HFpEF</term><def><p>heart failure with preserved ejection fraction</p></def></def-item><def-item><term id="abb8">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb9">ROC</term><def><p>receiver operating characteristic</p></def></def-item><def-item><term id="abb10">RQ</term><def><p>research question</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Borlaug</surname><given-names>BA</given-names> </name><name name-style="western"><surname>Sharma</surname><given-names>K</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Ho</surname><given-names>JE</given-names> </name></person-group><article-title>Heart failure with preserved ejection fraction: JACC scientific statement</article-title><source>J Am Coll Cardiol</source><year>2023</year><month>05</month><day>9</day><volume>81</volume><issue>18</issue><fpage>1810</fpage><lpage>1834</lpage><pub-id pub-id-type="doi">10.1016/j.jacc.2023.01.049</pub-id><pub-id 
pub-id-type="medline">37137592</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yamamoto</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yokochi</surname><given-names>T</given-names> </name></person-group><article-title>Transthyretin cardiac amyloidosis: an update on diagnosis and treatment</article-title><source>ESC Heart Fail</source><year>2019</year><month>12</month><volume>6</volume><issue>6</issue><fpage>1128</fpage><lpage>1139</lpage><pub-id pub-id-type="doi">10.1002/ehf2.12518</pub-id><pub-id pub-id-type="medline">31553132</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Haug</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Drazen</surname><given-names>JM</given-names> </name></person-group><article-title>Artificial intelligence and machine learning in clinical medicine, 2023</article-title><source>N Engl J Med</source><year>2023</year><month>03</month><day>30</day><volume>388</volume><issue>13</issue><fpage>1201</fpage><lpage>1208</lpage><pub-id pub-id-type="doi">10.1056/NEJMra2302038</pub-id><pub-id pub-id-type="medline">36988595</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Reading Turchioe</surname><given-names>M</given-names> </name><name name-style="western"><surname>Volodarskiy</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pathak</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wright</surname><given-names>DN</given-names> </name><name name-style="western"><surname>Tcheng</surname><given-names>JE</given-names> </name><name 
name-style="western"><surname>Slotwiner</surname><given-names>D</given-names> </name></person-group><article-title>Systematic review of current natural language processing methods and applications in cardiology</article-title><source>Heart</source><year>2022</year><month>05</month><day>25</day><volume>108</volume><issue>12</issue><fpage>909</fpage><lpage>916</lpage><pub-id pub-id-type="doi">10.1136/heartjnl-2021-319769</pub-id><pub-id pub-id-type="medline">34711662</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Garc&#x00ED;a Subies</surname><given-names>G</given-names> </name><name name-style="western"><surname>Barbero Jim&#x00E9;nez</surname><given-names>&#x00C1;</given-names> </name><name name-style="western"><surname>Mart&#x00ED;nez Fern&#x00E1;ndez</surname><given-names>P</given-names> </name></person-group><article-title>A comparative analysis of Spanish Clinical encoder-based models on NER and classification tasks</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>2137</fpage><lpage>2146</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae054</pub-id><pub-id pub-id-type="medline">38489543</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Miranda-Escalada</surname><given-names>A</given-names> </name><name name-style="western"><surname>Farr&#x00E9;-Maduell</surname><given-names>E</given-names> </name><name name-style="western"><surname>Krallinger</surname><given-names>M</given-names> </name></person-group><article-title>NUBes: a Spanish corpus of negation and uncertainty in the biomedical domain</article-title><year>2020</year><month>05</month><access-date>2025-07-12</access-date><conf-name>Twelfth Language Resources and 
Evaluation Conference</conf-name><conf-loc>Marseille, France</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2020.lrec-1.708">https://aclanthology.org/2020.lrec-1.708</ext-link></comment></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mujtaba</surname><given-names>G</given-names> </name><name name-style="western"><surname>Shuib</surname><given-names>L</given-names> </name><name name-style="western"><surname>Idris</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Clinical text classification research trends: systematic literature review and open issues</article-title><source>Expert Syst Appl</source><year>2019</year><month>02</month><volume>116</volume><fpage>494</fpage><lpage>520</lpage><pub-id pub-id-type="doi">10.1016/j.eswa.2018.09.034</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Adejumo</surname><given-names>P</given-names> </name><name name-style="western"><surname>Thangaraj</surname><given-names>PM</given-names> </name><name name-style="western"><surname>Dhingra</surname><given-names>LS</given-names> </name><etal/></person-group><article-title>Natural language processing of clinical documentation to assess functional status in patients with heart failure</article-title><source>JAMA Netw Open</source><year>2024</year><month>11</month><day>4</day><volume>7</volume><issue>11</issue><fpage>e2443925</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.43925</pub-id><pub-id pub-id-type="medline">39509128</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Tan</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>A natural language processing-based approach for early detection of heart failure onset using electronic health records</article-title><source>medRxiv</source><comment>Preprint posted online on  Apr 6, 2025</comment><pub-id pub-id-type="doi">10.1101/2025.04.04.25325211</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Marti-Castellote</surname><given-names>PM</given-names> </name><name name-style="western"><surname>Reeder</surname><given-names>C</given-names> </name><name name-style="western"><surname>Claggett</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Natural language processing to adjudicate heart failure hospitalizations in global clinical trials</article-title><source>Circ Heart Fail</source><year>2025</year><month>01</month><volume>18</volume><issue>1</issue><fpage>e012514</fpage><pub-id pub-id-type="doi">10.1161/CIRCHEARTFAILURE.124.012514</pub-id><pub-id pub-id-type="medline">39549261</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ahmad</surname><given-names>FS</given-names> </name><name name-style="western"><surname>Luo</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wehbe</surname><given-names>RM</given-names> </name><name name-style="western"><surname>Thomas</surname><given-names>JD</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>SJ</given-names> </name></person-group><article-title>Advances in machine learning approaches to heart failure with preserved ejection 
fraction</article-title><source>Heart Fail Clin</source><year>2022</year><month>04</month><volume>18</volume><issue>2</issue><fpage>287</fpage><lpage>300</lpage><pub-id pub-id-type="doi">10.1016/j.hfc.2021.12.002</pub-id><pub-id pub-id-type="medline">35341541</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Houssein</surname><given-names>EH</given-names> </name><name name-style="western"><surname>Mohamed</surname><given-names>RE</given-names> </name><name name-style="western"><surname>Ali</surname><given-names>AA</given-names> </name></person-group><article-title>Heart disease risk factors detection from electronic health records using advanced NLP and deep learning techniques</article-title><source>Sci Rep</source><year>2023</year><month>05</month><day>3</day><volume>13</volume><issue>1</issue><fpage>7173</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-34294-6</pub-id><pub-id pub-id-type="medline">37138014</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Houssein</surname><given-names>EH</given-names> </name><name name-style="western"><surname>Mohamed</surname><given-names>RE</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>G</given-names> </name><name name-style="western"><surname>Ali</surname><given-names>AA</given-names> </name></person-group><article-title>Adapting transformer-based language models for heart disease detection and risk factors extraction</article-title><source>J Big Data</source><year>2024</year><volume>11</volume><issue>1</issue><pub-id pub-id-type="doi">10.1186/s40537-024-00903-y</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Fan</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Mamouei</surname><given-names>M</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Rao</surname><given-names>S</given-names> </name><name name-style="western"><surname>Rahimi</surname><given-names>K</given-names> </name></person-group><article-title>Identification of heart failure subtypes using transformer-based deep learning modelling: a population-based study of 379,108 individuals</article-title><source>EBioMedicine</source><year>2025</year><month>04</month><volume>114</volume><fpage>105657</fpage><pub-id pub-id-type="doi">10.1016/j.ebiom.2025.105657</pub-id><pub-id pub-id-type="medline">40112740</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Neves</surname><given-names>M</given-names> </name><name name-style="western"><surname>&#x0160;eva</surname><given-names>J</given-names> </name></person-group><article-title>An extensive review of tools for manual annotation of documents</article-title><source>Brief Bioinform</source><year>2021</year><month>01</month><day>18</day><volume>22</volume><issue>1</issue><fpage>146</fpage><lpage>163</lpage><pub-id pub-id-type="doi">10.1093/bib/bbz130</pub-id><pub-id pub-id-type="medline">31838514</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><source>Prodigy: an annotation tool for AI, machine learning &#x0026; NLP</source><access-date>2025-07-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://prodi.gy/">https://prodi.gy/</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Fields</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chovanec</surname><given-names>K</given-names> </name><name name-style="western"><surname>Madiraju</surname><given-names>P</given-names> </name></person-group><article-title>A survey of text classification with transformers: how wide? How large? How long? How accurate? How expensive? How safe?</article-title><source>IEEE Access</source><year>2024</year><volume>12</volume><fpage>6518</fpage><lpage>6531</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2024.3349952</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Minaee</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kalchbrenner</surname><given-names>N</given-names> </name><name name-style="western"><surname>Cambria</surname><given-names>E</given-names> </name><name name-style="western"><surname>Nikzad</surname><given-names>N</given-names> </name><name name-style="western"><surname>Chenaghlu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>J</given-names> </name></person-group><article-title>Deep learning-based text classification: a comprehensive review</article-title><source>ACM Comput Surv</source><year>2022</year><pub-id pub-id-type="doi">10.1145/3439726</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Min</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ross</surname><given-names>H</given-names> </name><name name-style="western"><surname>Sulem</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Recent advances in natural language processing via large pre-trained language models: a 
survey</article-title><source>ACM Comput Surv</source><year>2024</year><month>02</month><day>29</day><volume>56</volume><issue>2</issue><fpage>1</fpage><lpage>40</lpage><pub-id pub-id-type="doi">10.1145/3605943</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nerella</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bandyopadhyay</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Transformers and large language models in healthcare: a review</article-title><source>Artif Intell Med</source><year>2024</year><month>08</month><volume>154</volume><fpage>102900</fpage><pub-id pub-id-type="doi">10.1016/j.artmed.2024.102900</pub-id><pub-id pub-id-type="medline">38878555</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Beltagy</surname><given-names>I</given-names> </name><name name-style="western"><surname>Peters</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Cohan</surname><given-names>A</given-names> </name></person-group><article-title>Longformer: the long-document transformer</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 10, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2004.05150</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Ca&#x00F1;ete</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chaperon</surname><given-names>G</given-names> </name><name name-style="western"><surname>Fuentes</surname><given-names>R</given-names> </name><name 
name-style="western"><surname>Ho</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Kang</surname><given-names>H</given-names> </name><name name-style="western"><surname>P&#x00E9;rez</surname><given-names>J</given-names> </name></person-group><article-title>Spanish pre-trained BERT model and evaluation data</article-title><source>arXiv</source><comment>Preprint posted online on  Aug 6, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2308.02976</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ott</surname><given-names>M</given-names> </name><name name-style="western"><surname>Goyal</surname><given-names>N</given-names> </name><name name-style="western"><surname>Du</surname><given-names>J</given-names> </name></person-group><article-title>RoBERTa: a robustly optimized BERT pretraining approach</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 26, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1907.11692</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Guti&#x00E9;rrez-Fandi&#x00F1;o</surname><given-names>A</given-names> </name><name name-style="western"><surname>Armengol-Estap&#x00E9;</surname><given-names>J</given-names> </name><name name-style="western"><surname>P&#x00E0;mies</surname><given-names>M</given-names> </name><name name-style="western"><surname>Llop-Palao</surname><given-names>J</given-names> </name><name name-style="western"><surname>Silveira-Ocampo</surname><given-names>J</given-names> </name><etal/></person-group><article-title>MarIA: Spanish language models</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 15, 2022</comment><pub-id
pub-id-type="doi">10.26342/2022-68-3</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Carrino</surname><given-names>CP</given-names> </name><name name-style="western"><surname>Armengol-Estap&#x00E9;</surname><given-names>J</given-names> </name><name name-style="western"><surname>Guti&#x00E9;rrez-Fandi&#x00F1;o</surname><given-names>A</given-names> </name><name name-style="western"><surname>Llop-Palao</surname><given-names>J</given-names> </name><name name-style="western"><surname>P&#x00E0;mies</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Biomedical and clinical language models for Spanish: on the benefits of domain-specific pretraining in a mid-resource scenario</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 8, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2109.03570</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Carrino</surname><given-names>CP</given-names> </name><name name-style="western"><surname>Llop</surname><given-names>J</given-names> </name><name name-style="western"><surname>P&#x00E0;mies</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Pretrained biomedical language models for clinical NLP in Spanish</article-title><year>2022</year><month>05</month><day>26</day><conf-name>Proceedings of the 21st Workshop on Biomedical Language Processing</conf-name><conf-loc>Dublin, Ireland</conf-loc><fpage>193</fpage><lpage>199</lpage><pub-id pub-id-type="doi">10.18653/v1/2022.bionlp-1.19</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lu</surname><given-names>H</given-names> 
</name><name name-style="western"><surname>Ehwerhemuepha</surname><given-names>L</given-names> </name><name name-style="western"><surname>Rakovski</surname><given-names>C</given-names> </name></person-group><article-title>A comparative study on deep learning models for text classification of unstructured medical notes with various levels of class imbalance</article-title><source>BMC Med Res Methodol</source><year>2022</year><month>07</month><day>2</day><volume>22</volume><issue>1</issue><fpage>181</fpage><pub-id pub-id-type="doi">10.1186/s12874-022-01665-y</pub-id><pub-id pub-id-type="medline">35780100</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>&#x00C7;orbac&#x0131;o&#x011F;lu</surname><given-names>&#x015E;K</given-names> </name><name name-style="western"><surname>Aksel</surname><given-names>G</given-names> </name></person-group><article-title>Receiver operating characteristic curve analysis in diagnostic accuracy studies: a guide to interpreting the area under the curve value</article-title><source>Turk J Emerg Med</source><year>2023</year><volume>23</volume><issue>4</issue><fpage>195</fpage><lpage>198</lpage><pub-id pub-id-type="doi">10.4103/tjem.tjem_182_23</pub-id><pub-id pub-id-type="medline">38024184</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wyles</surname><given-names>CC</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Odum</surname><given-names>SL</given-names> </name><etal/></person-group><article-title>External validation of natural language processing algorithms to extract common data elements in THA operative notes</article-title><source>J 
Arthroplasty</source><year>2023</year><month>10</month><volume>38</volume><issue>10</issue><fpage>2081</fpage><lpage>2084</lpage><pub-id pub-id-type="doi">10.1016/j.arth.2022.10.031</pub-id><pub-id pub-id-type="medline">36280160</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Supplemental materials.</p><media xlink:href="jmir_v27i1e76433_app1.pdf" xlink:title="PDF File, 43 KB"/></supplementary-material></app-group></back></article>