<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v27i1e66910</article-id>
      <article-id pub-id-type="pmid">39946687</article-id>
      <article-id pub-id-type="doi">10.2196/66910</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Using Structured Codes and Free-Text Notes to Measure Information Complementarity in Electronic Health Records: Feasibility and Validation Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Tsafnat</surname>
            <given-names>Guy</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Remy</surname>
            <given-names>François</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Van Es</surname>
            <given-names>Bram</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Liu</surname>
            <given-names>Yuxi</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Seinen</surname>
            <given-names>Tom M</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Medical Informatics</institution>
            <institution>Erasmus University Medical Center</institution>
            <addr-line>Dr. Molewaterplein 40</addr-line>
            <addr-line>Rotterdam, 3015 GD</addr-line>
            <country>Netherlands</country>
            <phone>31 010 7044122</phone>
            <email>t.seinen@erasmusmc.nl</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5369-8260</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Kors</surname>
            <given-names>Jan A</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4929-026X</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>van Mulligen</surname>
            <given-names>Erik M</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1377-9386</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Rijnbeek</surname>
            <given-names>Peter R</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0621-1979</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Medical Informatics</institution>
        <institution>Erasmus University Medical Center</institution>
        <addr-line>Rotterdam</addr-line>
        <country>Netherlands</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Tom M Seinen <email>t.seinen@erasmusmc.nl</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2025</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>13</day>
        <month>2</month>
        <year>2025</year>
      </pub-date>
      <volume>27</volume>
      <elocation-id>e66910</elocation-id>
      <history>
        <date date-type="received">
          <day>4</day>
          <month>10</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>2</day>
          <month>11</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>22</day>
          <month>11</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>23</day>
          <month>11</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Tom M Seinen, Jan A Kors, Erik M van Mulligen, Peter R Rijnbeek. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 13.02.2025.</copyright-statement>
      <copyright-year>2025</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2025/1/e66910" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Electronic health records (EHRs) consist of both structured data (eg, diagnostic codes) and unstructured data (eg, clinical notes). It is commonly believed that unstructured clinical narratives provide more comprehensive information. However, this assumption lacks large-scale validation and direct validation methods.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to quantitatively compare the information in structured and unstructured EHR data and directly validate whether unstructured data offers more extensive information across a patient population.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We analyzed both structured and unstructured data from patient records and visits in a large Dutch primary care EHR database between January 2021 and January 2024. Clinical concepts were identified from free-text notes using an extraction framework tailored for Dutch and compared with concepts from structured data. Concept embeddings were generated to measure semantic similarity between structured and extracted concepts through cosine similarity. A similarity threshold was systematically determined via annotated matches and minimized weighted Gini impurity. We then quantified the concept overlap between structured and unstructured data across various concept domains and patient populations.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>In a population of 1.8 million patients, only 13% of extracted concepts from patient records and 7% from individual visits had similar structured counterparts. Conversely, 42% of structured concepts in records and 25% in visits had similar matches in unstructured data. Condition concepts had the highest overlap, followed by measurements and drug concepts. Subpopulation visits, such as those with chronic conditions or psychological disorders, showed different proportions of data overlap, indicating varied reliance on structured versus unstructured data across clinical contexts.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Our study demonstrates the feasibility of quantifying the information difference between structured and unstructured data, showing that the unstructured data provides important additional information in the studied database and populations. The annotated concept matches are made publicly available for the clinical natural language processing community. Despite some limitations, our proposed methodology proves versatile, and its application can lead to more robust and insightful observational clinical research.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>natural language processing</kwd>
        <kwd>named entity recognition</kwd>
        <kwd>clinical concept extraction</kwd>
        <kwd>machine learning</kwd>
        <kwd>electronic health records</kwd>
        <kwd>EHR</kwd>
        <kwd>word embeddings</kwd>
        <kwd>clinical concept similarity</kwd>
        <kwd>text mining</kwd>
        <kwd>code</kwd>
        <kwd>free-text</kwd>
        <kwd>information</kwd>
        <kwd>electronic record</kwd>
        <kwd>data</kwd>
        <kwd>patient records</kwd>
        <kwd>framework</kwd>
        <kwd>structured data</kwd>
        <kwd>unstructured data</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Electronic health records (EHRs), originally designed for clinical documentation and administration, are now increasingly used in observational research [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>], supporting various types of studies, including case studies, patient cohort characterizations, and clinical prediction modeling. EHR data are generally recorded in 2 forms: structured and unstructured data. Structured data includes clinical codes for documenting clinical events, such as diagnoses, medications, procedures, and measurements. Structured data is particularly suitable for observational research due to its consistent meaning, tabular format, and standardized vocabulary of codes. Unstructured data consists of free-text clinical notes, which can provide detailed descriptions capturing the nuances of patient care, such as physician observations, patient histories, diagnostic impressions, and discharge summaries. Although rich in contextual information, unstructured data poses challenges for direct analysis because of its variability and lack of standardization [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. Consequently, extracting meaningful information from unstructured data requires significant investment in manual labor, computational resources, and time.</p>
      <p>It is commonly assumed that the text data contains more detailed and extensive information than structured data, based on the often-reported claim—grounded in business-related data [<xref ref-type="bibr" rid="ref5">5</xref>]—that 80% of EHR data is unstructured [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. While this assumption may hold, its validity is influenced by factors like documentation quality and clinical context. For example, intensive care data is characterized by a high frequency of measurements, while psychiatric care relies more on textual narratives. Many studies have explored the added value of information from text by comparing analyses with and without it [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref15">15</xref>], indirectly validating this assumption. However, even if the assumption holds, it remains uncertain to what extent the information in the text data matches or complements the structured data.</p>
      <p>Understanding the quantity and differences in the information available in structured and unstructured data for a specific database offers several advantages for observational clinical research. First, it aids study design by identifying the most abundant and reliable data types, enabling researchers to formulate feasible hypotheses and research questions. Second, it allows for more effective allocation of human and computational resources by focusing efforts where they are most needed. Third, knowing the balance between structured and unstructured data helps researchers prioritize according to the study’s specific needs. Finally, it highlights gaps or unique aspects of the data, facilitating the exploration of underused research opportunities.</p>
      <p>Comparing the information from structured and unstructured data involves various measures, such as quantity and content. While structured data points can be counted and unstructured data quantified by individual words or extracted concepts, comparing content similarity is more challenging. The core meaning of both structured codes and unstructured text lies in their semantic content. Evaluating the information distance between 2 concepts or texts requires comparing their semantic meanings, a task commonly addressed in natural language processing through semantic similarity measures. Modern approaches often use word embedding models to generate concept embeddings, which are used in applications like biomedical ontology matching [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>] and concept normalization [<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref20">20</xref>]. Specialized embedding models, such as SapBERT [<xref ref-type="bibr" rid="ref21">21</xref>] and BioLORD [<xref ref-type="bibr" rid="ref22">22</xref>], have been developed for this purpose and provide the opportunity to measure the information difference between structured and unstructured data.</p>
      <p>Several studies have compared structured and unstructured data for specific clinical variables, such as social and behavioral determinants of health [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>] and smoking history [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. However, to our knowledge, no research has directly assessed the information differences between structured and unstructured data across all clinical events in a database. Our study aims to quantitatively compare the information coded by general practitioners (GPs) with the information documented in free-text notes, using data from a large Dutch GP database. We extracted clinical concepts from unstructured text and used concept embeddings to calculate their similarity with the structured concepts. After determining a similarity threshold, we estimated the difference and overlap of information between structured and unstructured data.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Database and Setting</title>
        <p>This study used the Integrated Primary Care Information (IPCI) database [<xref ref-type="bibr" rid="ref27">27</xref>], a longitudinal EHR database of Dutch GPs. IPCI contains records of 2.9 million patients with a median follow-up of 4.8 years, spanning from 1993 to 2024. The database is standardized using the Observational Medical Outcomes Partnership Common Data Model [<xref ref-type="bibr" rid="ref28">28</xref>]. Eligible participants in our study dataset included all patients recorded in the database from January 2021 to January 2024. The study received approval from the IPCI governance board under code 2023-04.</p>
      </sec>
      <sec>
        <title>Methodological Setup</title>
        <p>The methods consist of 4 main parts, visualized in <xref rid="figure1" ref-type="fig">Figure 1</xref>, each described in detail in the following sections. First, we extracted both structured and unstructured data for each eligible patient and applied a concept extraction framework to extract clinical concepts from the free-text notes. Second, we applied 2 different data grouping methods to the population. Third, we used pretrained multilingual concept embeddings to calculate the similarities between the structured and extracted concepts. Finally, we annotated a sample of concept similarity matches to determine a similarity score threshold and quantified the data similarity in the database.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Visualization of the methodological setup. Steps 1-4 outline the primary process, including data extraction, data point grouping, concept comparison, and data similarity quantification. Steps A, B, and C detail additional processes, specifically the extraction of clinical concepts, the creation of concept embeddings, and the determination of a similarity threshold. EHR: electronic health record.</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e66910_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Data Extraction</title>
        <p>We extracted all structured and unstructured data points for each eligible patient (step 1 in <xref rid="figure1" ref-type="fig">Figure 1</xref>). Structured data includes conditions, procedures, prescriptions, measurements, and observations. The coding systems used, such as the International Classification of Primary Care-1, are listed in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Unstructured data consists of 3 types of notes: subjective, objective, assessment, and plan notes from consultations; referral and communication notes with secondary care; and other notes from free-text fields in the EHR system, primarily accompanying condition codes.</p>
      </sec>
      <sec>
        <title>Clinical Concept Extraction</title>
        <p>Structured coded clinical events are considered single data points, while free-text notes can contain multiple pieces of information embedded in the narrative. To compare these, we extracted individual data points from unstructured text using clinical named entity recognition and linking, generally known as clinical concept extraction (step A in <xref rid="figure1" ref-type="fig">Figure 1</xref>). We used MedSpacy [<xref ref-type="bibr" rid="ref29">29</xref>], a toolkit that extracts clinical concepts based on a reference thesaurus such as the Unified Medical Language System (UMLS) and detects contextual modifiers using language-specific rules. We used a version of MedSpacy adapted for Dutch [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref30">30</xref>], incorporating all Dutch vocabularies from UMLS and replacing the English Systematized Nomenclature of Medicine Clinical Terms with the Dutch translation [<xref ref-type="bibr" rid="ref31">31</xref>]. We used Dutch context rules to detect qualifiers such as negation, temporality, and experiencer. These rules were previously validated with an annotated corpus [<xref ref-type="bibr" rid="ref32">32</xref>]. The extracted UMLS concepts were mapped to concept domains such as observation, condition, measurement, drug, and procedure in the Observational Medical Outcomes Partnership standardized vocabulary, which contains the majority of UMLS vocabularies, for easy comparison with the structured data.</p>
      </sec>
      <sec>
        <title>Data Point Grouping</title>
        <p>Given the longitudinal nature of GP EHR data, we used 2 grouping methods for comparison: by patient visit and by patient record (step 2 in <xref rid="figure1" ref-type="fig">Figure 1</xref>). Grouping by visit considers data points recorded simultaneously during a patient visit, providing a natural basis for comparison, while grouping by record includes additional information recorded outside visits, such as lab results and secondary care communications, allowing for a broader comparison.</p>
      </sec>
      <sec>
        <title>Concept Comparison</title>
        <p>Each structured or extracted data point is represented by a single clinical concept. We compared the Cartesian product of structured and extracted concepts within each group (step 3 in <xref rid="figure1" ref-type="fig">Figure 1</xref>). For example, a visit with 1 coded condition and 1 prescription, along with a subjective, objective, assessment, and plan note containing 5 extracted concepts, results in 10 comparisons. While we could count exact concept matches, 2 issues arise. First, the compared concepts can be very similar but not identical, for example, the International Classification of Primary Care-1 code “Fracture: hand/foot bone” (L74) and the Systematized Nomenclature of Medicine Clinical Terms concept “Closed fracture of hand” (704005005). Second, the concept vocabularies differ, as the GPs choose from a limited set of codes, while concept extraction uses the complete UMLS.</p>
        <p>To overcome these issues, we used concept embeddings for fuzzy matching, enabling us to measure semantic similarities and recognize similar concepts. Cosine similarity, which measures the cosine of the angle between 2 embedding vectors, was used to calculate similarities, ranging from –1 (strongly opposite) to 1 (very similar). For each structured and extracted concept, we stored only the highest similarity match, as we are only interested in finding the most similar concept in the other data type.</p>
      </sec>
      <sec>
        <title>Concept Embeddings</title>
        <p>We used the BioLORD-2023-M pretrained sentence transformer model [<xref ref-type="bibr" rid="ref22">22</xref>] to generate the multilingual concept embeddings, as visualized in step B of <xref rid="figure1" ref-type="fig">Figure 1</xref>. BioLORD-2023-M is designed to produce meaningful representations for biomedical concepts across multiple languages, including English and Dutch, and its cross-lingual performance has been evaluated by the authors [<xref ref-type="bibr" rid="ref22">22</xref>]. By inputting the concept description, either from UMLS for the extracted concepts or from the source vocabulary for the structured concepts, the model generates a dense 768-dimensional vector, allowing us to create embeddings for both structured and extracted concepts. For concepts with multiple descriptions or synonyms, we calculated an embedding for each and averaged them to create one comprehensive concept representation. For structured observations and measurements, we included the unit and value in the description to enrich the embedding. The model’s multilingual capability enabled us to embed concept descriptions in both Dutch and English within the same latent space.</p>
      </sec>
      <sec>
        <title>Similarity Threshold Determination</title>
        <p>To quantify the information difference between structured and extracted concepts, we needed to define a threshold for concept similarity. Since concept similarity depends on the nature of the embeddings, we developed a systematic method to determine this threshold, as visualized in step C of <xref rid="figure1" ref-type="fig">Figure 1</xref>. First, we randomly selected concept pairs at various similarity levels, ranging from a similarity score of 0.35 to 1, with samples taken at 0.05 intervals. This sampling was done for both structured and extracted concepts across patient visits and records, ensuring each concept domain was represented by sampling 5 concepts per domain. Next, we manually annotated the concept pairs as either similar or nonsimilar. Using these annotations, we determined the threshold at which the weighted Gini impurity [<xref ref-type="bibr" rid="ref33">33</xref>] of the split between similar and nonsimilar matches was lowest.</p>
      </sec>
      <sec>
        <title>Data Similarity Quantification</title>
        <p>Using a similarity threshold and the most similar counterpart for each structured and extracted concept, we determined the number of structured concepts found in free text (structured-to-unstructured) and the number of extracted concepts that were coded in the structured data (unstructured-to-structured), as shown in step 4 of <xref rid="figure1" ref-type="fig">Figure 1</xref>. It is important to note that these counts are not reciprocal since we consider the maximum similarity per concept. For example, multiple extracted concepts from a patient visit text may be highly similar to a single structured concept, but we only compare the structured concept to its most similar extracted counterpart to determine its presence in the text, not the frequency. We calculated these counts and their percentages across the entire set of concepts, as well as within different concept domains, to explore domain-specific differences. For extracted concepts, we only included those without context modifiers to focus on the core unmodified concepts, which ensures a higher degree of certainty and a more straightforward, reliable comparison.</p>
      </sec>
      <sec>
        <title>Subpopulation Comparison</title>
        <p>While observing data similarity across the entire population or all GP visits is insightful, applying this method to smaller subpopulations may provide further detail. We defined 3 subpopulations based on different types of clinical events: visits for a chronic disorder (type 2 diabetes mellitus), an acute event (COVID-19 vaccination), and a psychological disorder (depression), as detailed in Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. We then quantified the similarity between structured and unstructured data for these subpopulations, as was done for the full population.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Population and Data Characteristics</title>
        <p><xref ref-type="table" rid="table1">Table 1</xref> presents the population and data characteristics for patient records and visits, including details per observation. Additionally, characteristics are listed for 3 subpopulations. Key differences between patient visits and records are evident. First, each patient has one complete record but can have multiple visits. Second, the total number of structured events and their median values indicate that many events are recorded outside GP visits, as their total numbers are roughly 3 times higher in records. Similarly, the total number of clinical notes is twice as high in records compared to visits alone. Third, more extracted concepts per structured event are generally found in patient visits, but the median number of concepts per note is the same.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Population and data characteristics for patient records, patient visits, and 3 subpopulation-specific visits.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="400"/>
            <col width="0"/>
            <col width="120"/>
            <col width="0"/>
            <col width="110"/>
            <col width="0"/>
            <col width="130"/>
            <col width="0"/>
            <col width="90"/>
            <col width="0"/>
            <col width="120"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Characteristic</td>
                <td colspan="2">Patient records</td>
                <td colspan="2">Patient visits</td>
                <td colspan="2">COVID-19 vaccination visits</td>
                <td colspan="2">Depression visits</td>
                <td>Diabetes visits</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="12">
                  <bold>Total numbers</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Number of patients</td>
                <td colspan="2">1,794,209</td>
                <td colspan="2">1,507,473</td>
                <td colspan="2">62,724</td>
                <td colspan="2">20,519</td>
                <td colspan="2">51,032</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Number of observations</td>
                <td colspan="2">1,794,209</td>
                <td colspan="2">13,995,524</td>
                <td colspan="2">84,484</td>
                <td colspan="2">63,957</td>
                <td colspan="2">324,698</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Ratio number of observations/number of patients</td>
                <td colspan="2">1</td>
                <td colspan="2">9.28</td>
                <td colspan="2">1.35</td>
                <td colspan="2">3.12</td>
                <td colspan="2">6.36</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Number of structured events</td>
                <td colspan="2">132,287,566</td>
                <td colspan="2">41,232,407</td>
                <td colspan="2">281,606</td>
                <td colspan="2">118,351</td>
                <td colspan="2">4,513,756</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Number of free-text notes</td>
                <td colspan="2">95,790,377</td>
                <td colspan="2">47,915,131</td>
                <td colspan="2">188,318</td>
                <td colspan="2">216,449</td>
                <td colspan="2">1,253,418</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Number of extracted concepts</td>
                <td colspan="2">635,232,943</td>
                <td colspan="2">245,052,660</td>
                <td colspan="2">688,420</td>
                <td colspan="2">1,374,430</td>
                <td colspan="2">5,329,978</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Ratio extracted concepts/structured events</td>
                <td colspan="2">4.80</td>
                <td colspan="2">5.94</td>
                <td colspan="2">2.44</td>
                <td colspan="2">11.61</td>
                <td colspan="2">1.18</td>
              </tr>
              <tr valign="top">
                <td colspan="12">
                  <bold>Statistics per observation</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Percentage male, %</td>
                <td colspan="2">49</td>
                <td colspan="2">49</td>
                <td colspan="2">45</td>
                <td colspan="2">33</td>
                <td colspan="2">51</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Median age (years)</td>
                <td colspan="2">39</td>
                <td colspan="2">39</td>
                <td colspan="2">62</td>
                <td colspan="2">49</td>
                <td colspan="2">68</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Median number of structured events</td>
                <td colspan="2">33</td>
                <td colspan="2">2</td>
                <td colspan="2">2</td>
                <td colspan="2">1</td>
                <td colspan="2">6</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Median number of free-text notes</td>
                <td colspan="2">35</td>
                <td colspan="2">3</td>
                <td colspan="2">2</td>
                <td colspan="2">3</td>
                <td colspan="2">3</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Median number of extracted concepts</td>
                <td colspan="2">172</td>
                <td colspan="2">12</td>
                <td colspan="2">5</td>
                <td colspan="2">15</td>
                <td colspan="2">12</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Median number of concepts per note</td>
                <td colspan="2">3</td>
                <td colspan="2">3</td>
                <td colspan="2">2</td>
                <td colspan="2">3</td>
                <td colspan="2">3</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <p>Characteristics of visits across different clinical events also show clear contrasts. As expected, diabetes had the most visits per patient, while COVID-19 vaccination had the fewest. Furthermore, diabetes visits contain large amounts of both structured and unstructured data, whereas depression visits rely mostly on unstructured data, with a large median number of extracted concepts per visit and few structured events. Demographic differences are also notable: the diabetes and vaccination populations are similar, but the depression population is younger and predominantly female.</p>
      </sec>
      <sec>
        <title>Determining the Similarity Threshold</title>
        <p>We annotated 1764 matches in 4 concept samples: structured-to-unstructured and unstructured-to-structured in both patient record and visit populations. For transparency and reproducibility, the annotated samples are available in Table S3 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. <xref rid="figure2" ref-type="fig">Figure 2</xref>B visualizes some example extracted concepts matched with increasing similarity to the structured concepts SARS-CoV-2 (COVID-19) and hypertension. The weighted Gini impurity calculated over the binary annotated matches at different similarity thresholds is presented in <xref rid="figure2" ref-type="fig">Figure 2</xref>A. We found the minimum impurity of each of the 4 samples between thresholds of 0.55 and 0.65. To be conservative, we set the threshold at 0.65 for data similarity quantification.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Analysis across similarity thresholds. (A) The weighted Gini impurity measures the level of impurity in the annotated concept combinations in the 4 samples over the different thresholds. The points indicate the minimum impurity in each sample, and the dashed lines represent their lower bound (0.55) and upper bound (0.65). (B) Examples of extracted concepts matched to the structured concepts of SARS-CoV-2 (COVID-19) and hypertension across different similarity thresholds. (C) The percentage of structured concepts matched to the extracted concepts (blue) over the range of similarity thresholds and vice versa (green) for both patient records and visits. The red dashed line in all panels depicts the determined similarity threshold value of 0.65.</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e66910_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Similarity of Structured and Unstructured Data</title>
        <p>The percentage of structured concepts that have a similar match to an extracted concept and vice versa over various similarity thresholds is visualized in <xref rid="figure2" ref-type="fig">Figure 2</xref>C for both patient visits and records. At the similarity threshold of 0.65, indicated by the red line in <xref rid="figure2" ref-type="fig">Figure 2</xref>, 42% (55.1 million of 132.1 million) of structured concepts in patient records are similar to a concept extracted from text, compared to 25% (9.3 million of 37.8 million) for visits. In contrast, only 13% (66.9 million of 501.9 million) of extracted concepts are similar to a structured concept in patient records, and 7% (11.3 million of 155.7 million) in visits (<xref rid="figure3" ref-type="fig">Figure 3</xref>A). This indicates that information in the structured data is more likely to be present in the unstructured data than vice versa. The difference between patient records and visits can be explained by the number of available concepts for matching the entire record versus a single visit.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Comparison of structured and extracted concepts in patient records and visits. (A) Total number of structured (left) and extracted concepts (right), along with the number of concepts matched at a similarity threshold of 0.65 (blue for structured to unstructured data, green for unstructured to structured). The percentage of matched concepts is listed in the chart for each data type. (B) The proportion that each concept domain contributed to the total number of structured (left) and extracted concepts (right), along with the proportion of concepts in each domain matched to the other data type.</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e66910_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p><xref rid="figure3" ref-type="fig">Figure 3</xref>B presents the counts and overlap percentages for different domains of structured and extracted concepts. Primarily, structured condition concepts are often also present in the text, with 75% (17.6 million of 23.6 million) for patient records and 55% (4.3 million of 7.8 million) for visits. Similarly, extracted condition concepts are also most often matched with structured concepts, with 36% (23.9 million of 65.6 million) in patient records and 22% (5.1 million of 22.8 million) in visits. Other concept domains, such as measurements and drugs, show a relatively high overlap in both structured and unstructured data as well.</p>
        <p>Interestingly, differences in concept domain numbers between patient visits and records are larger for structured concepts than extracted concepts. For example, the proportion of structured observation concepts is 31% (11.9 million of 37.8 million) in patient visits but 15% (20.4 million of 132.1 million) in patient records, and the reverse is true for drug concepts. However, this difference is not observed in extracted concepts.</p>
      </sec>
      <sec>
        <title>Differences Between Subpopulations</title>
        <p><xref rid="figure4" ref-type="fig">Figure 4</xref>A presents the total concept overlap at the selected threshold for the 3 subpopulations, showing ratios of structured and extracted concepts as seen in <xref ref-type="table" rid="table1">Table 1</xref>. While the proportion of structured data overlapping with unstructured data and vice versa is similar for the vaccination and diabetes populations, the depression population shows a much higher proportion of structured data matched to concepts in the text (43,934/110,317, 40%) and a lower proportion of extracted concepts matched to structured concepts (55,921/1,051,186, 5%). <xref rid="figure4" ref-type="fig">Figure 4</xref>B shows different proportions of concept domains in the structured data across the 3 populations. However, in the extracted concepts, the proportions are relatively similar. The data similarity results are consistent with the total population, with conditions, drugs, and measurement concepts showing the highest overlap between structured and unstructured data.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Comparison of structured and extracted concepts in the 3 subpopulation visits. (A) Total number of structured (left) and extracted concepts (right), along with the number of concepts matched at a similarity threshold of 0.65 (blue for structured to unstructured data, green for unstructured to structured). The percentage of matched concepts is listed in the chart for each data type. (B) The proportion that each concept domain contributed to the total number of structured (left) and extracted concepts (right), along with the proportion of concepts in each domain matched to the other data type.</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e66910_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Comparing Structured and Unstructured Clinical Data</title>
        <p>This study explored the feasibility of quantifying the information difference between structured and unstructured data in a large primary care EHR database. We used concept embeddings to measure the similarity between structured clinical event concepts and concepts extracted from free-text narratives. By systematically determining a similarity threshold, we found that a substantial proportion of structured information is also present in unstructured data, while only a small portion of unstructured information is reflected in structured data. This indicates that most concepts in one data type do not match those in the other, suggesting that the information in structured and unstructured data is highly complementary. The difference between the data modalities can be attributed to certain types of information being exclusively structured, such as measurements made at the general practice, while other information, like observations reported in unstructured communication of a hospital to a GP, is often not captured in the structured data of the GP record.</p>
        <p>Furthermore, we observed that condition concepts had the largest overlap between structured and unstructured data, followed by measurements and drug concepts. These results were consistent across different data point grouping methods, by patient record or visit. We also quantified the information difference in smaller subpopulations of patients with specific diseases or procedures. Differences in overlap proportions were evident between these subpopulations, but the concept domains with the largest overlap were the same as in the full population. Overall, our findings validate the often assumed notion that unstructured data contains more extensive information than structured data in this specific database. Most importantly, we prove the feasibility of quantifying the information difference in an EHR database by using a combination of clinical concept extraction methods [<xref ref-type="bibr" rid="ref29">29</xref>], high-dimensional clinical concept embeddings for similarity measurement [<xref ref-type="bibr" rid="ref22">22</xref>], and annotating a small number of matched concepts to determine the appropriate similarity threshold.</p>
      </sec>
      <sec>
        <title>Strengths and Limitations</title>
        <p>The study was conducted on a large, private dataset, which limits public reproducibility. However, our approach to compare and quantify differences between structured and unstructured data is dataset-agnostic. With the detailed pipeline and methodology, the work is adaptable and replicable to any EHR dataset containing both data types. There are several limitations to our methodology. The results depend heavily on 3 factors: the type and quality of concept extraction, the type of concept embeddings used for determining concept similarity, and the chosen similarity threshold. First, the performance of named entity recognition and entity linking is crucial for extracting all relevant information from unstructured data. Imperfect extraction can lead to missed concepts (false negatives) that cannot be matched to structured concepts and wrongly extracted concepts (false positives) that distort the results. The MedSpacy extraction framework used in this work was validated for different languages and demonstrates good performance [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. It was chosen for its availability in Dutch, its versatility with ontologies, and its extraction speed. Second, different types of word and concept embeddings may vary in their ability to distinguish between similar and nonsimilar clinical concepts, affecting the distribution of similarity scores. While other, more complex embeddings might improve similarity discrimination, we only tested the multilingual BioLord-2023-M embedding model [<xref ref-type="bibr" rid="ref22">22</xref>], as it was specifically trained to produce meaningful representations for clinical sentences and biomedical concepts across different languages, making it perfectly suited to our application. Third, the chosen similarity threshold impacts the results. Setting this threshold can be subjective and heavily depends on the concept embeddings. Therefore, we used a systematic approach using the minimization of the Gini impurity to determine the threshold, and we published the annotated concept matches for transparency. Lastly, data grouping and the ordering of data points in time pose a challenge. We used 2 grouping methods that ignored the time aspect within each group to explore the differences. However, more sophisticated approaches might be necessary depending on the research question. For instance, data recorded outside patient visits may be assigned to the nearest visit, or more advanced sliding time windows could be used to group the data points.</p>
        <p>Overall, the strength of this study lies in enhancing our understanding of the information overlap between structured and unstructured clinical data within a large GP EHR database covering 1.8 million patients. Our methodology of using concept embeddings to calculate the similarity between clinical concepts offers a versatile and language-independent solution, demonstrated here for Dutch, ensuring accurate comparisons by capturing nuances in clinical terminology.</p>
      </sec>
      <sec>
        <title>Future Work</title>
        <p>Future research in quantifying the data similarity or difference between different data types could explore more advanced concept extraction and embedding techniques or alternative similarity measurements beyond cosine similarity. Our study focused on individual concepts for comparison, but combining multiple concepts might yield higher similarity matches. Investigating the use of concept n-grams and further incorporating context modifiers of the extracted concepts could be beneficial. We applied concept extraction before embedding to retain the granular meaning of individual events within a document. This approach ensures that specific events—such as symptoms, prescriptions, and procedures—are not lost in broader document or sentence embeddings. Future research could explore directly comparing structured concepts and free text without prior clinical concept extraction to enhance the method's reliability and applicability.</p>
        <p>We used a single similarity threshold across all concept domains and populations. Future work could involve establishing separate thresholds for each concept domain and population for a more refined comparison. Additionally, considering distinct thresholds for Dutch-to-Dutch and Dutch-to-English concept comparisons might be beneficial, as cross-lingual similarities may vary. The potential applications of our methodology extend beyond the clinical domain. With the appropriate information extraction framework and embedding model, our approach could be adapted to various settings. Conducting specific clinical case studies to demonstrate the benefits of leveraging the data-type differences would be a logical next step for research.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>Our study aimed to assess the feasibility of quantifying the information difference between structured and unstructured data in EHR databases. Within a large Dutch primary care EHR database, we successfully demonstrated that unstructured data provides more extensive information than structured data, quantitatively validating this prevailing assumption. By leveraging concept embeddings to measure semantic similarity between structured concepts and those extracted from free-text narratives, we found that a significant portion of structured information is present in unstructured data, while the reverse occurs much less frequently. Notably, concept domains such as conditions, measurements, and drugs exhibited the largest overlap. Despite limitations related to the performance of concept extraction, the type of embeddings used, and the determination of similarity thresholds, our methodology is versatile and was applied across different data grouping methods and subpopulations. The exploration of more sophisticated techniques could further enhance the accuracy and applicability of this approach. We suggest that structured and unstructured data should be used together, as their combined information exceeds that of each data type separately. Understanding the extent and nature of information in structured and unstructured data within a database can enhance study design, research exploration, resource allocation, and data prioritization, ultimately leading to more robust and insightful observational clinical research.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Supplementary material.</p>
        <media xlink:href="jmir_v27i1e66910_app1.docx" xlink:title="DOCX File, 34 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Table S3: annotated concept matches.</p>
        <media xlink:href="jmir_v27i1e66910_app2.zip" xlink:title="ZIP File (Zip Archive), 98 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">GP</term>
          <def>
            <p>general practitioner</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">IPCI</term>
          <def>
            <p>Integrated Primary Care Information</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">UMLS</term>
          <def>
            <p>Unified Medical Language System</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work has received support from the European Health Data &#38; Evidence Network (EHDEN) project. EHDEN has received funding from the Innovative Medicines Initiative 2 Joint Undertaking (JU) under grant agreement number 806968. The JU receives support from the European Union’s Horizon 2020 research and innovation program and EFPIA.</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>TMS proposed the methodology, designed and implemented the study protocol, and performed the data analysis. JAK, EMvM, and PRR provided critical feedback, helped interpret the results, and shaped the research and analysis. TMS wrote the article with valuable input from all other authors.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Knevel</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Liao</surname>
              <given-names>KP</given-names>
            </name>
          </person-group>
          <article-title>From real-world electronic health record data to real-world results using artificial intelligence</article-title>
          <source>Ann Rheum Dis</source>
          <year>2023</year>
          <month>03</month>
          <volume>82</volume>
          <issue>3</issue>
          <fpage>306</fpage>
          <lpage>311</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://ard.bmj.com/lookup/pmidlookup?view=long&#38;pmid=36150748"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/ard-2022-222626</pub-id>
          <pub-id pub-id-type="medline">36150748</pub-id>
          <pub-id pub-id-type="pii">ard-2022-222626</pub-id>
          <pub-id pub-id-type="pmcid">PMC9933153</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reps</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Schuemie</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Suchard</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Ryan</surname>
              <given-names>PB</given-names>
            </name>
            <name name-style="western">
              <surname>Rijnbeek</surname>
              <given-names>PR</given-names>
            </name>
          </person-group>
          <article-title>Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2018</year>
          <month>08</month>
          <day>01</day>
          <volume>25</volume>
          <issue>8</issue>
          <fpage>969</fpage>
          <lpage>975</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/29718407"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocy032</pub-id>
          <pub-id pub-id-type="medline">29718407</pub-id>
          <pub-id pub-id-type="pii">4989437</pub-id>
          <pub-id pub-id-type="pmcid">PMC6077830</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Assale</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Dui</surname>
              <given-names>LG</given-names>
            </name>
            <name name-style="western">
              <surname>Cina</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Seveso</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cabitza</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>The revival of the notes field: leveraging the unstructured content in electronic health records</article-title>
          <source>Front Med (Lausanne)</source>
          <year>2019</year>
          <volume>6</volume>
          <fpage>66</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31058150"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fmed.2019.00066</pub-id>
          <pub-id pub-id-type="medline">31058150</pub-id>
          <pub-id pub-id-type="pmcid">PMC6478793</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Pan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Goldwasser</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Verma</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>WP</given-names>
            </name>
            <name name-style="western">
              <surname>Nuzumlalı</surname>
              <given-names>MY</given-names>
            </name>
            <name name-style="western">
              <surname>Rosand</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Taylor</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Krumholz</surname>
              <given-names>HM</given-names>
            </name>
            <name name-style="western">
              <surname>Radev</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Neural natural language processing for unstructured data in electronic health records: a review</article-title>
          <source>Comput Sci Rev</source>
          <year>2022</year>
          <month>11</month>
          <volume>46</volume>
          <fpage>100511</fpage>
          <pub-id pub-id-type="doi">10.1016/j.cosrev.2022.100511</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Murdoch</surname>
              <given-names>TB</given-names>
            </name>
            <name name-style="western">
              <surname>Detsky</surname>
              <given-names>AS</given-names>
            </name>
          </person-group>
          <article-title>The inevitable application of big data to health care</article-title>
          <source>J Am Med Assoc</source>
          <year>2013</year>
          <month>04</month>
          <day>03</day>
          <volume>309</volume>
          <issue>13</issue>
          <fpage>1351</fpage>
          <lpage>1352</lpage>
          <pub-id pub-id-type="doi">10.1001/jama.2013.393</pub-id>
          <pub-id pub-id-type="medline">23549579</pub-id>
          <pub-id pub-id-type="pii">1674245</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chiu</surname>
              <given-names>CC</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>CM</given-names>
            </name>
            <name name-style="western">
              <surname>Chien</surname>
              <given-names>TN</given-names>
            </name>
            <name name-style="western">
              <surname>Kao</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chu</surname>
              <given-names>CM</given-names>
            </name>
          </person-group>
          <article-title>Integrating structured and unstructured EHR data for predicting mortality by machine learning and latent Dirichlet allocation method</article-title>
          <source>Int J Environ Res Public Health</source>
          <year>2023</year>
          <month>02</month>
          <day>28</day>
          <volume>20</volume>
          <issue>5</issue>
          <fpage>4340</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=ijerph20054340"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/ijerph20054340</pub-id>
          <pub-id pub-id-type="medline">36901354</pub-id>
          <pub-id pub-id-type="pii">ijerph20054340</pub-id>
          <pub-id pub-id-type="pmcid">PMC10001457</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hashir</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sawhney</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Towards unstructured mortality prediction with free-text clinical notes</article-title>
          <source>J Biomed Inform</source>
          <year>2020</year>
          <month>08</month>
          <volume>108</volume>
          <fpage>103489</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(20)30117-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2020.103489</pub-id>
          <pub-id pub-id-type="medline">32592755</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(20)30117-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kong</surname>
              <given-names>HJ</given-names>
            </name>
          </person-group>
          <article-title>Managing unstructured big data in healthcare system</article-title>
          <source>Healthc Inform Res</source>
          <year>2019</year>
          <month>01</month>
          <volume>25</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>2</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/30788175"/>
          </comment>
          <pub-id pub-id-type="doi">10.4258/hir.2019.25.1.1</pub-id>
          <pub-id pub-id-type="medline">30788175</pub-id>
          <pub-id pub-id-type="pmcid">PMC6372467</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Seinen</surname>
              <given-names>TM</given-names>
            </name>
            <name name-style="western">
              <surname>Fridgeirsson</surname>
              <given-names>EA</given-names>
            </name>
            <name name-style="western">
              <surname>Ioannou</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jeannetot</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>John</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Kors</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Markus</surname>
              <given-names>AF</given-names>
            </name>
            <name name-style="western">
              <surname>Pera</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Rekkas</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>RD</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>van Mulligen</surname>
              <given-names>EM</given-names>
            </name>
            <name name-style="western">
              <surname>Rijnbeek</surname>
              <given-names>PR</given-names>
            </name>
          </person-group>
          <article-title>Use of unstructured text in prognostic clinical prediction models: a systematic review</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2022</year>
          <month>06</month>
          <day>14</day>
          <volume>29</volume>
          <issue>7</issue>
          <fpage>1292</fpage>
          <lpage>1302</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/35475536"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocac058</pub-id>
          <pub-id pub-id-type="medline">35475536</pub-id>
          <pub-id pub-id-type="pii">6574714</pub-id>
          <pub-id pub-id-type="pmcid">PMC9196702</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Seinen</surname>
              <given-names>TM</given-names>
            </name>
            <name name-style="western">
              <surname>Kors</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>van Mulligen</surname>
              <given-names>EM</given-names>
            </name>
            <name name-style="western">
              <surname>Fridgeirsson</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Rijnbeek</surname>
              <given-names>PR</given-names>
            </name>
          </person-group>
          <article-title>The added value of text from Dutch general practitioner notes in predictive modeling</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2023</year>
          <month>11</month>
          <day>17</day>
          <volume>30</volume>
          <issue>12</issue>
          <fpage>1973</fpage>
          <lpage>1984</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37587084"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocad160</pub-id>
          <pub-id pub-id-type="medline">37587084</pub-id>
          <pub-id pub-id-type="pii">7243430</pub-id>
          <pub-id pub-id-type="pmcid">PMC10654855</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Anzaldi</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Hernandez</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Davison</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Boyd</surname>
              <given-names>CM</given-names>
            </name>
            <name name-style="western">
              <surname>Leff</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Kimura</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>The value of unstructured electronic health record data in geriatric syndrome case identification</article-title>
          <source>J Am Geriatr Soc</source>
          <year>2018</year>
          <month>08</month>
          <volume>66</volume>
          <issue>8</issue>
          <fpage>1499</fpage>
          <lpage>1507</lpage>
          <pub-id pub-id-type="doi">10.1111/jgs.15411</pub-id>
          <pub-id pub-id-type="medline">29972595</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>DD</given-names>
            </name>
            <name name-style="western">
              <surname>Yin</surname>
              <given-names>CC</given-names>
            </name>
            <name name-style="western">
              <surname>Zeng</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Yuan</surname>
              <given-names>XH</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Combining structured and unstructured data for predictive models: a deep learning approach</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2020</year>
          <month>10</month>
          <day>29</day>
          <volume>20</volume>
          <issue>1</issue>
          <fpage>280</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-020-01297-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-020-01297-6</pub-id>
          <pub-id pub-id-type="medline">33121479</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-020-01297-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC7596962</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>DY</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>RW</given-names>
            </name>
          </person-group>
          <article-title>Enhancing readmission prediction models by integrating insights from home healthcare notes: retrospective cohort study</article-title>
          <source>Int J Nurs Stud</source>
          <year>2024</year>
          <month>10</month>
          <volume>158</volume>
          <fpage>104850</fpage>
          <pub-id pub-id-type="doi">10.1016/j.ijnurstu.2024.104850</pub-id>
          <pub-id pub-id-type="medline">39024965</pub-id>
          <pub-id pub-id-type="pii">S0020-7489(24)00163-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Marafino</surname>
              <given-names>BJ</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Davies</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Thombley</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Luft</surname>
              <given-names>HS</given-names>
            </name>
            <name name-style="western">
              <surname>Sing</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Kazi</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>DeJong</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Boscardin</surname>
              <given-names>WJ</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Dudley</surname>
              <given-names>RA</given-names>
            </name>
          </person-group>
          <article-title>Validation of prediction models for critical care outcomes using natural language processing of electronic health record data</article-title>
          <source>JAMA Netw Open</source>
          <year>2018</year>
          <month>12</month>
          <day>07</day>
          <volume>1</volume>
          <issue>8</issue>
          <fpage>e185097</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/30646310"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2018.5097</pub-id>
          <pub-id pub-id-type="medline">30646310</pub-id>
          <pub-id pub-id-type="pii">2719128</pub-id>
          <pub-id pub-id-type="pmcid">PMC6324323</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tsui</surname>
              <given-names>FR</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Ruiz</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Ryan</surname>
              <given-names>ND</given-names>
            </name>
            <name name-style="western">
              <surname>Biernesser</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Iyengar</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Walsh</surname>
              <given-names>CG</given-names>
            </name>
            <name name-style="western">
              <surname>Brent</surname>
              <given-names>DA</given-names>
            </name>
          </person-group>
          <article-title>Natural language processing and machine learning of electronic health records for prediction of first-time suicide attempts</article-title>
          <source>JAMIA Open</source>
          <year>2021</year>
          <month>01</month>
          <volume>4</volume>
          <issue>1</issue>
          <fpage>ooab011</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/33758800"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamiaopen/ooab011</pub-id>
          <pub-id pub-id-type="medline">33758800</pub-id>
          <pub-id pub-id-type="pii">ooab011</pub-id>
          <pub-id pub-id-type="pmcid">PMC7966858</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Park</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hwang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Concept embedding to measure semantic relatedness for biomedical information ontologies</article-title>
          <source>J Biomed Inform</source>
          <year>2019</year>
          <month>06</month>
          <volume>94</volume>
          <fpage>103182</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(19)30100-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2019.103182</pub-id>
          <pub-id pub-id-type="medline">31009761</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(19)30100-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Ontology matching with word embeddings</article-title>
          <source>Chinese Computational Linguistics and Natural Language Processing Based on Naturally Annotated Big Data</source>
          <year>2014</year>
          <conf-name>13th China National Conference, CCL 2014, and Second International Symposium, NLP-NABD 2014</conf-name>
          <conf-date>2014 October 18-19</conf-date>
          <conf-loc>Wuhan, China</conf-loc>
          <publisher-name>Springer</publisher-name>
          <pub-id pub-id-type="doi">10.1007/978-3-319-12277-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abdulnazar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kreuzthaler</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Roller</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Schulz</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>SapBERT-based medical concept normalization using SNOMED CT</article-title>
          <source>Caring is Sharing? Exploiting the Value in Data for Health and Innovation</source>
          <year>2023</year>
          <publisher-loc>USA</publisher-loc>
          <publisher-name>IOS Press</publisher-name>
          <fpage>825</fpage>
          <lpage>826</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zahra</surname>
              <given-names>FA</given-names>
            </name>
            <name name-style="western">
              <surname>Kate</surname>
              <given-names>RJ</given-names>
            </name>
          </person-group>
          <article-title>Obtaining clinical term embeddings from SNOMED CT ontology</article-title>
          <source>J Biomed Inform</source>
          <year>2024</year>
          <month>01</month>
          <volume>149</volume>
          <fpage>104560</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(23)00281-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2023.104560</pub-id>
          <pub-id pub-id-type="medline">38070816</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(23)00281-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>A simple neural vector space model for medical concept normalization using concept embeddings</article-title>
          <source>J Biomed Inform</source>
          <year>2022</year>
          <month>06</month>
          <volume>130</volume>
          <fpage>104080</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(22)00096-X"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2022.104080</pub-id>
          <pub-id pub-id-type="medline">35472514</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(22)00096-X</pub-id>
          <pub-id pub-id-type="pmcid">PMC9351985</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Vulić</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Korhonen</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Collier</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Learning domain-specialised representations for cross-lingual biomedical entity linking</article-title>
          <source>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)</source>
          <year>2021</year>
          <conf-name>The 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing</conf-name>
          <conf-date>2021 August 1-6</conf-date>
          <conf-loc>Online</conf-loc>
          <publisher-name>Association for Computational Linguistics</publisher-name>
          <fpage>565</fpage>
          <lpage>574</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/2021.acl-short.72</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Remy</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Demuynck</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Demeester</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>BioLORD-2023: semantic textual representations fusing large language models and clinical knowledge graph insights</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2024</year>
          <month>09</month>
          <day>01</day>
          <volume>31</volume>
          <issue>9</issue>
          <fpage>1844</fpage>
          <lpage>1855</lpage>
          <pub-id pub-id-type="doi">10.1093/jamia/ocae029</pub-id>
          <pub-id pub-id-type="medline">38412333</pub-id>
          <pub-id pub-id-type="pii">7614965</pub-id>
          <pub-id pub-id-type="pmcid">PMC11339519</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hatef</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Rouhizadeh</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Tia</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Lasser</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Hill-Briggs</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Marsteller</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Assessing the availability of data on social and behavioral determinants in structured and unstructured electronic health records: a retrospective analysis of a multilevel health care system</article-title>
          <source>JMIR Med Inform</source>
          <year>2019</year>
          <month>08</month>
          <day>02</day>
          <volume>7</volume>
          <issue>3</issue>
          <fpage>e13802</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2019/3/e13802/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/13802</pub-id>
          <pub-id pub-id-type="medline">31376277</pub-id>
          <pub-id pub-id-type="pii">v7i3e13802</pub-id>
          <pub-id pub-id-type="pmcid">PMC6696855</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bucher</surname>
              <given-names>BT</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Pettit</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ferraro</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>WW</given-names>
            </name>
            <name name-style="western">
              <surname>Gundlapalli</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Determination of marital status of patients from structured and unstructured electronic healthcare data</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2019</year>
          <volume>2019</volume>
          <fpage>267</fpage>
          <lpage>274</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/32308819"/>
          </comment>
          <pub-id pub-id-type="medline">32308819</pub-id>
          <pub-id pub-id-type="pmcid">PMC7153091</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ruckdeschel</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Riley</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Parsatharathy</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chamarthi</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Rajagopal</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>HS</given-names>
            </name>
            <name name-style="western">
              <surname>Mangold</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Driscoll</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Unstructured data are superior to structured data for eliciting quantitative smoking history from the electronic health record</article-title>
          <source>JCO Clin Cancer Inform</source>
          <year>2023</year>
          <month>02</month>
          <volume>7</volume>
          <fpage>e2200155</fpage>
          <pub-id pub-id-type="doi">10.1200/cci.22.00155</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Ruan</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Comparison of three information sources for smoking information in electronic health records</article-title>
          <source>Cancer Inform</source>
          <year>2016</year>
          <month>12</month>
          <day>08</day>
          <volume>15</volume>
          <fpage>CIN.S40604</fpage>
          <pub-id pub-id-type="doi">10.4137/cin.s40604</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>de Ridder</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>de Wilde</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>de Ben</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Leyba</surname>
              <given-names>AR</given-names>
            </name>
            <name name-style="western">
              <surname>Mosseveld</surname>
              <given-names>BM</given-names>
            </name>
            <name name-style="western">
              <surname>Verhamme</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>van der Lei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Rijnbeek</surname>
              <given-names>PR</given-names>
            </name>
          </person-group>
          <article-title>Data resource profile: the Integrated Primary Care Information (IPCI) database, the Netherlands</article-title>
          <source>Int J Epidemiol</source>
          <year>2022</year>
          <volume>51</volume>
          <issue>6</issue>
          <fpage>e314</fpage>
          <lpage>e323</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/35182144"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/ije/dyac026</pub-id>
          <pub-id pub-id-type="medline">35182144</pub-id>
          <pub-id pub-id-type="pii">6532386</pub-id>
          <pub-id pub-id-type="pmcid">PMC9749682</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hripcsak</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Duke</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>NH</given-names>
            </name>
            <name name-style="western">
              <surname>Reich</surname>
              <given-names>CG</given-names>
            </name>
            <name name-style="western">
              <surname>Huser</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Schuemie</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Suchard</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>RW</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>ICK</given-names>
            </name>
            <name name-style="western">
              <surname>Rijnbeek</surname>
              <given-names>PR</given-names>
            </name>
            <name name-style="western">
              <surname>van der Lei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Pratt</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Norén</surname>
              <given-names>GN</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Stang</surname>
              <given-names>PE</given-names>
            </name>
            <name name-style="western">
              <surname>Madigan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ryan</surname>
              <given-names>PB</given-names>
            </name>
          </person-group>
          <article-title>Observational Health Data Sciences and Informatics (OHDSI): opportunities for observational researchers</article-title>
          <source>Stud Health Technol Inform</source>
          <year>2015</year>
          <volume>216</volume>
          <fpage>574</fpage>
          <lpage>578</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/26262116"/>
          </comment>
          <pub-id pub-id-type="doi">10.3233/978-1-61499-564-7-574</pub-id>
          <pub-id pub-id-type="medline">26262116</pub-id>
          <pub-id pub-id-type="pmcid">PMC4815923</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eyre</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>AB</given-names>
            </name>
            <name name-style="western">
              <surname>Peterson</surname>
              <given-names>KS</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Alba</surname>
              <given-names>PR</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Box</surname>
              <given-names>TL</given-names>
            </name>
            <name name-style="western">
              <surname>DuVall</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Patterson</surname>
              <given-names>OV</given-names>
            </name>
          </person-group>
          <article-title>Launching into clinical space with medspaCy: a new clinical text processing toolkit in Python</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2021</year>
          <volume>2021</volume>
          <fpage>438</fpage>
          <lpage>447</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/35308962"/>
          </comment>
          <pub-id pub-id-type="medline">35308962</pub-id>
          <pub-id pub-id-type="pii">3576697</pub-id>
          <pub-id pub-id-type="pmcid">PMC8861690</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Seinen</surname>
              <given-names>TM</given-names>
            </name>
            <name name-style="western">
              <surname>Kors</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>van Mulligen</surname>
              <given-names>EM</given-names>
            </name>
            <name name-style="western">
              <surname>Rijnbeek</surname>
              <given-names>PR</given-names>
            </name>
          </person-group>
          <article-title>Annotation-preserving machine translation of English corpora to validate Dutch clinical concept extraction tools</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2024</year>
          <month>08</month>
          <day>01</day>
          <volume>31</volume>
          <issue>8</issue>
          <fpage>1725</fpage>
          <lpage>1734</lpage>
          <pub-id pub-id-type="doi">10.1093/jamia/ocae159</pub-id>
          <pub-id pub-id-type="medline">38934643</pub-id>
          <pub-id pub-id-type="pii">7697361</pub-id>
          <pub-id pub-id-type="pmcid">PMC11258409</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="web">
          <source>SNOMED National Release Centre of the Netherlands 2024</source>
          <access-date>2024-09-25</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.snomed.org/member/netherlands">https://www.snomed.org/member/netherlands</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Afzal</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Pons</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Sturkenboom</surname>
              <given-names>MC</given-names>
            </name>
            <name name-style="western">
              <surname>Schuemie</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kors</surname>
              <given-names>JA</given-names>
            </name>
          </person-group>
          <article-title>ContextD: an algorithm to identify contextual properties of medical terms in a Dutch clinical corpus</article-title>
          <source>BMC Bioinformatics</source>
          <year>2014</year>
          <month>11</month>
          <day>29</day>
          <volume>15</volume>
          <issue>1</issue>
          <fpage>373</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-014-0373-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12859-014-0373-3</pub-id>
          <pub-id pub-id-type="medline">25432799</pub-id>
          <pub-id pub-id-type="pii">s12859-014-0373-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC4264258</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nembrini</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>König</surname>
              <given-names>IR</given-names>
            </name>
            <name name-style="western">
              <surname>Wright</surname>
              <given-names>MN</given-names>
            </name>
          </person-group>
          <article-title>The revival of the Gini importance?</article-title>
          <source>Bioinformatics</source>
          <year>2018</year>
          <volume>34</volume>
          <issue>21</issue>
          <fpage>3711</fpage>
          <lpage>3718</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/29757357"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/bty373</pub-id>
          <pub-id pub-id-type="medline">29757357</pub-id>
          <pub-id pub-id-type="pii">4994791</pub-id>
          <pub-id pub-id-type="pmcid">PMC6198850</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
