<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e90246</article-id><article-id pub-id-type="doi">10.2196/90246</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Addressing Data Quality Challenges in Lung Cancer Data Within the Observational Medical Outcomes Partnership Common Data Model: Observational Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Declerck</surname><given-names>Jens</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Deschepper</surname><given-names>Mieke</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Colpaert</surname><given-names>Kirsten</given-names></name><degrees>Prof Dr</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Kalra</surname><given-names>Dipak</given-names></name><degrees>Dr med</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Coorevits</surname><given-names>Pascal</given-names></name><degrees>Prof Dr</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Public Health and Primary Care, Ghent University, Unit of Medical Informatics and Statistics</institution><addr-line>Corneel Heymanslaan 10</addr-line><addr-line>Ghent</addr-line><country>Belgium</country></aff><aff id="aff2"><institution>The European Institute for Innovation through Health Data</institution><addr-line>Ghent</addr-line><country>Belgium</country></aff><aff id="aff3"><institution>Ghent University Hospital, Data Science Institute</institution><addr-line>Ghent</addr-line><country>Belgium</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Balcarras</surname><given-names>Matthew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Jouned</surname><given-names>Adnan</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Spotnitz</surname><given-names>Matthew</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Ismayilov</surname><given-names>Rashad</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Jens Declerck, MSc, Department of Public Health and Primary Care, Ghent University, Unit of Medical Informatics and Statistics, Corneel Heymanslaan 10, Ghent, Belgium, 32 0474538199; <email>jens.declerck@i-hd.eu</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date 
pub-type="epub"><day>8</day><month>6</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e90246</elocation-id><history><date date-type="received"><day>23</day><month>12</month><year>2025</year></date><date date-type="rev-recd"><day>04</day><month>05</month><year>2026</year></date><date date-type="accepted"><day>05</day><month>05</month><year>2026</year></date></history><copyright-statement>&#x00A9; Jens Declerck, Mieke Deschepper, Kirsten Colpaert, Dipak Kalra, Pascal Coorevits. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 8.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e90246"/><abstract><sec><title>Background</title><p>The secondary use of health data is essential for advancing medical research and improving clinical practice. 
The Observational Medical Outcomes Partnership (OMOP) Common Data Model (CDM) enables large-scale, multicenter studies but faces challenges related to consistency, completeness, and transparency during data mapping from original data sources.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate the quality of the mapping process for lung cancer data within the Federated Health Innovation Network project, with a focus on consistency, completeness, and challenges encountered throughout the process.</p></sec><sec sec-type="methods"><title>Methods</title><p>Clinical data from Ghent University Hospital were mapped to the OMOP CDM using a reference data dictionary. Consistency was assessed using Cohen kappa (&#x03BA;) scores, while completeness was evaluated by comparing patient and record counts before and after mapping. Challenges, including unstructured data and an evolving reference standard, were documented and analyzed.</p></sec><sec sec-type="results"><title>Results</title><p>High consistency was observed for structured variables, while some unstructured variables, such as &#x201C;Smoking status,&#x201D; were excluded due to their free-text format and the lack of suitable OMOP concepts. The completeness analysis showed minimal data loss for most structured variables but highlighted substantial challenges associated with unstructured data. Persistent issues included evolving data dictionary versions and mismatches in diagnostic code granularity between institutions, underscoring structural challenges in standardization.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The transformation of lung cancer data to the OMOP CDM highlighted both technical and systemic challenges, including the handling of unstructured data and the resolution of granularity discrepancies. 
A multidisciplinary approach involving clinical and technical expertise is crucial for ensuring reliable, high-quality datasets for multicenter research.</p></sec></abstract><kwd-group><kwd>health data quality</kwd><kwd>Observational Medical Outcomes Partnership Common Data Model</kwd><kwd>OMOP CDM</kwd><kwd>primary use</kwd><kwd>secondary use</kwd><kwd>extract, transform, and load</kwd><kwd>ETL</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The secondary use of health data&#x2014;leveraging existing health information for purposes beyond direct patient care&#x2014;has become a cornerstone for advancing medical research [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>], developing health care policies [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>], and improving clinical practices [<xref ref-type="bibr" rid="ref5">5</xref>]. By integrating health data from diverse clinical settings, researchers can uncover valuable insights into disease patterns [<xref ref-type="bibr" rid="ref6">6</xref>], treatment outcomes [<xref ref-type="bibr" rid="ref7">7</xref>], and health care processes [<xref ref-type="bibr" rid="ref8">8</xref>]. This approach is particularly crucial for studying rare diseases or uncommon clinical events, in which data from a single source are often insufficient [<xref ref-type="bibr" rid="ref9">9</xref>]. 
The power of large-scale, multicenter datasets lies in their ability to address complex research questions, but this potential can only be fully realized if data quality is ensured throughout the entire data lifecycle&#x2014;from primary data capture to transformation and integration into a standardized framework for secondary use [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>].</p><p>Data quality remains one of the most substantial challenges in the effective secondary use of health data [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. Poor-quality data can lead to incorrect research findings [<xref ref-type="bibr" rid="ref13">13</xref>], poor clinical decision-making [<xref ref-type="bibr" rid="ref14">14</xref>], and misguided health care policies [<xref ref-type="bibr" rid="ref15">15</xref>]. Data quality is influenced by multiple factors, including the reliability of the primary data sources, the transformation processes used to standardize data, and the quality of the resulting secondary datasets [<xref ref-type="bibr" rid="ref11">11</xref>]. The extract, transform, and load (ETL) process plays a critical role, as it involves consolidating, standardizing, and integrating data from multiple sources. Each stage of the ETL process presents unique challenges and risks to data quality. 
Errors at any stage can compromise the usability and reliability of the final dataset, leading to potential misinterpretations in downstream analyses [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>].</p><p>Although frameworks addressing data quality in primary and secondary datasets are well established [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref22">22</xref>], mapping clinical data from different sources into a standardized model such as the Observational Medical Outcomes Partnership (OMOP) Common Data Model (CDM) remains prone to challenges [<xref ref-type="bibr" rid="ref4">4</xref>]. The OMOP CDM is a standardized framework for structuring and analyzing health care data from diverse sources, such as electronic health records (EHRs). By adopting uniform data structures and standardized terminologies, such as Systematized Nomenclature of Medicine&#x2013;Clinical Terms (SNOMED CT), Logical Observation Identifiers Names and Codes (LOINC), and <italic>International Classification of Diseases, 10th Revision</italic> (<italic>ICD-10</italic>), the OMOP CDM facilitates interoperability, collaborative research, and large-scale data analysis. OMOP provides a robust framework for integrating diverse datasets by standardizing both data structure and terminologies [<xref ref-type="bibr" rid="ref3">3</xref>], thereby enabling multicenter research on treatment outcomes [<xref ref-type="bibr" rid="ref23">23</xref>] and health care delivery [<xref ref-type="bibr" rid="ref24">24</xref>]. 
However, variability in data extraction and mapping practices can introduce inconsistencies, thereby affecting the reliability and reproducibility of research findings [<xref ref-type="bibr" rid="ref25">25</xref>].</p><p>Despite the increasing adoption of the OMOP CDM and the growing usability of secondary datasets [<xref ref-type="bibr" rid="ref26">26</xref>], there is limited guidance on how to systematically evaluate mapping quality or address discrepancies in granularity and completeness [<xref ref-type="bibr" rid="ref27">27</xref>]. This lack of structured approaches for assessing the transformation process creates barriers to new implementations, particularly in multicenter settings.</p><p>This study sought to address these gaps by focusing on the quality of the mapping process during the implementation of the OMOP CDM for lung cancer data. This effort was part of the Federated Health Innovation Network (FHIN) project, an open-source collaboration among Belgian hospitals to develop a fully automated, federated platform aimed at addressing research questions in the field of lung cancer [<xref ref-type="bibr" rid="ref28">28</xref>]. As part of this project, a data dictionary was provided, detailing the mapping of raw data to OMOP CDM concepts. This dictionary, which outlined one-to-one relationships between raw data elements and OMOP CDM concept IDs, served as a reference standard.</p><p>Specifically, this study aimed to explore strategies for preserving data quality during the process of mapping data to the OMOP CDM. The primary objective was to evaluate the quality of the mapping process by examining its completeness and consistency. 
The secondary objective was to identify the challenges and complexities encountered during the implementation of the OMOP CDM and to develop a practical framework to guide future OMOP implementations.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design and Setting</title><p>This study was independently conducted by the Data Science Institute (DSI) of Ghent University Hospital and the European Institute for Innovation through Health Data to ensure transparency and traceability of the mapping process of clinical data into the OMOP CDM. This study evaluated the mapping of clinical data to the OMOP CDM within the context of lung cancer data integration, with a focus on reproducibility and data quality assessment. Although not part of the FHIN project, this study aligns with its goals by ensuring rigorous documentation and standardization of the mapping process for lung cancer data. The study design emphasizes reproducibility and adaptability to similar multicenter initiatives.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>This study did not undergo a formal institutional review board or research ethics committee assessment because it was based on fully anonymized data and did not involve direct interaction with human participants. No identifiable personal data were accessed, and all data were handled in compliance with applicable data protection regulations.</p></sec><sec id="s2-3"><title>Reference Standard Provided by the FHIN Project</title><p>As part of the FHIN project, a data dictionary was provided that defined key data items relevant to lung cancer and their mapping to specific OMOP concept IDs. This dictionary served as the reference standard for evaluating the consistency of our mapping process. Examples of some of the variables included in the data dictionary are provided in <xref ref-type="table" rid="table1">Table 1</xref>. 
However, the dictionary lacked critical details, including the original data sources for the variables, the extraction logic (eg, identification of the relevant tables and fields for each data element and transformation of field values to the standard terminology relevant in OMOP), and the rationale behind assigning specific concept IDs. Additionally, the data dictionary evolved throughout the project, reflecting adjustments made as part of the data quality process. These factors complicated efforts to fully replicate the mapping process, potentially introducing variability and bias. For transparency, we based our evaluation on the version of the data dictionary available as of December 1, 2024.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Example of the data dictionary.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Concept ID</td><td align="left" valign="bottom">Concept name</td><td align="left" valign="bottom">Vocabulary ID</td><td align="left" valign="bottom">Concept code</td><td align="left" valign="bottom">Observational Medical Outcomes Partnership table</td></tr></thead><tbody><tr><td align="left" valign="top">44790293</td><td align="left" valign="top">Radiotherapy delivery</td><td align="left" valign="top">SNOMED CT<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">231711000000108</td><td align="left" valign="top">PROCEDURE</td></tr><tr><td align="left" valign="top">40483776</td><td align="left" valign="top">Total radiation dose delivered</td><td align="left" valign="top">SNOMED CT</td><td align="left" valign="top">445461008</td><td align="left" valign="top">MEASUREMENT</td></tr><tr><td align="left" valign="top">4155148</td><td align="left" valign="top">Delivered radiation dose</td><td align="left" valign="top">SNOMED CT</td><td align="left" valign="top">371892002</td><td align="left" 
valign="top">MEASUREMENT</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>SNOMED CT: Systematized Nomenclature of Medicine&#x2013;Clinical Terms.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-4"><title>Data Quality Assurance</title><p>Data quality assurance was performed to evaluate completeness and consistency. Completeness was assessed by comparing the total number of patients and records extracted from the raw data sources with those successfully transformed into OMOP CDM tables. Consistency assessment was performed to evaluate the agreement between the OMOP concept IDs assigned during the ETL process and those specified in the reference data dictionary. For each data category, which typically included multiple variables, Cohen &#x03BA; scores [<xref ref-type="bibr" rid="ref29">29</xref>] were calculated at the variable level. Agreement was defined as an exact match between the concept ID assigned during mapping and the expected concept ID in the dictionary. Variables that were unmapped, mismatched, or lacked a valid concept ID were considered disagreements. To report a single consistency score per category, the final &#x03BA; score was calculated as the unweighted average of the individual &#x03BA; scores of all variables within that category, as a descriptive summary measure across heterogeneous variables [<xref ref-type="bibr" rid="ref30">30</xref>]. To capture within-category variability, the SDs of the &#x03BA; scores and the number of variables per category were additionally calculated and reported. This approach allowed a balanced assessment across categories, independent of variable count or complexity, and helped identify specific areas of misalignment in the mapping process.</p></sec><sec id="s2-5"><title>Data Sources and Extraction</title><p>On the basis of the variables defined in the data dictionary, all relevant data items were extracted from the data sources at Ghent University Hospital. 
The extracted data included records from patients between January 1, 2016, and December 31, 2023. The extraction process involved developing and executing SQL queries to retrieve the specified variables from various hospital databases. These data sources included the DSI&#x2013;Data Warehouse (DWH); Multidisciplinary Oncology Consultation (MOC) application; <italic>Minimale Ziekenhuisgegevens</italic> (MZG); General Laboratory Information Management System; and admission, discharge, and transfer systems, each containing data relevant to the mapping process. The DSI-DWH is a curated database where vital parameters, such as weight and height, are stored. Data with the same meaning (eg, weight) were extracted from different fields within the EHR, cleaned, and standardized. The MOC application [<xref ref-type="bibr" rid="ref31">31</xref>] provides essential histology and pathology information, particularly related to cancer cases. The MZG [<xref ref-type="bibr" rid="ref32">32</xref>] stores diagnostic codes in <italic>ICD-10</italic> format, which are key for mapping clinical diagnoses. The General Laboratory Information Management System contains analysis codes and laboratory values classified using LOINC. Finally, the admission, discharge, and transfer system contains administrative variables and additional patient characteristics.</p><p>The data were stored in a staging area with tables reflecting the structure of the source systems. This staging area enabled uniform querying across databases, ensuring that data were harmonized before the ETL process. By implementing a structured extraction workflow, errors were minimized, and traceability from source to target was ensured.</p></sec><sec id="s2-6"><title>ETL Process</title><p>The ETL process was implemented to harmonize extracted data into OMOP CDM version 5.4. During extraction, raw data were stored in a centralized Microsoft SQL database for processing. 
Transformation involved automated and manual mappings to OMOP standards. Automated mappings were conducted with SQL scripts using the OMOP vocabularies. An example of the mapping from <italic>ICD-10</italic> to standard OMOP codes can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. This process aligned source data with terminologies such as SNOMED CT, LOINC, and <italic>ICD-10</italic>. Manual mappings were facilitated by Keun [<xref ref-type="bibr" rid="ref33">33</xref>], particularly for complex variables such as genetic mutations; tumor, node, and metastasis (TNM) staging; and World Health Organization functional scores. SQL scripts were developed to transform raw data into OMOP-compliant tables. These scripts ensured that variables were assigned to appropriate domains and that data transformations adhered to OMOP guidelines. Special attention was given to the handling of unstructured data, such as free-text variables, which posed challenges during mapping. The final load process was executed using the FHIN tool Rabbit-in-a-Blender [<xref ref-type="bibr" rid="ref34">34</xref>], an ETL pipeline used to transform raw data into the OMOP CDM. Mapping approaches varied by data category, with both automated and manual strategies applied. Manual mapping was used for categories such as &#x201C;Clinical TNM staging,&#x201D; &#x201C;Pathological TNM staging,&#x201D; and &#x201C;Genetic mutations.&#x201D; Automated mapping was applied to standardized classifications, including &#x201C;Histology,&#x201D; &#x201C;Laboratory tests,&#x201D; and &#x201C;Diagnosis.&#x201D;</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Consistency</title><p>The mapping process involved 12 data categories necessary for transforming lung cancer data into the OMOP CDM. 
These categories included essential clinical, genetic, and demographic variables such as diagnostic codes, TNM staging, and genetic mutations (eg, Kirsten rat sarcoma and v-raf murine sarcoma viral oncogene homolog B). Mapping methods varied by category, using either manual processes requiring domain expertise or automated methods for which established standards enabled straightforward mapping. Manual mapping was used for categories such as &#x201C;Clinical TNM staging,&#x201D; &#x201C;Pathological TNM staging,&#x201D; and &#x201C;Genetic mutations,&#x201D; where specialized knowledge was critical to ensure accuracy. Automated mapping was applied to standardized classifications, including &#x201C;Histology&#x201D; (based on the International Classification of Diseases for Oncology 3rd edition classification), &#x201C;Laboratory tests&#x201D; (based on the LOINC classification), and &#x201C;Diagnosis&#x201D; (based on the <italic>ICD-10</italic> classification). Additional details can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>To evaluate mapping consistency, &#x03BA; scores were calculated at the variable level within each data category by comparing assigned OMOP concept IDs with those defined in the reference data dictionary. For each category, the reported &#x03BA; represents the unweighted mean of the variable-level &#x03BA; values. In addition, the number of variables per category and the SD were calculated to reflect variability within categories. A mean score of 1 indicates perfect alignment across all variables in that category, while lower scores and higher SD values highlight categories in which mapping challenges or heterogeneity were more pronounced. 
The mean &#x03BA; scores, SDs, and number of variables per category are presented in <xref ref-type="table" rid="table2">Table 2</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Mean kappa (&#x03BA;) scores, SDs, and number of variables per category according to the mapping process.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Mapping processes and categories</td><td align="left" valign="bottom">Variables, n (%)</td><td align="left" valign="bottom" colspan="2">Cohen &#x03BA; score, mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="4">Automated mapping</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Diagnosis</td><td align="left" valign="top">19 (5.3)</td><td align="left" valign="top" colspan="2">0.842 (0.375)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Histology</td><td align="left" valign="top">26 (7.3)</td><td align="left" valign="top" colspan="2">1.000 (0)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Laboratory tests</td><td align="left" valign="top">185 (52)</td><td align="left" valign="top" colspan="2">0.968 (0.178)</td></tr><tr><td align="left" valign="top" colspan="4">Manual mapping</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Clinical TNM<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> staging</td><td align="left" valign="top">38 (10.7)</td><td align="left" valign="top" colspan="2">0.921 (0.273)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Gender</td><td align="left" valign="top">2 (0.6)</td><td align="left" 
valign="top" colspan="2">0 (0)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Genetic mutations</td><td align="left" valign="top">4 (1.1)</td><td align="left" valign="top" colspan="2">0.500 (0.577)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Pathological TNM staging</td><td align="left" valign="top">27 (7.6)</td><td align="left" valign="top" colspan="2">0.926 (0.267)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Smoking status</td><td align="left" valign="top">1 (0.3)</td><td align="left" valign="top" colspan="2">0 (N/A<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup>)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Therapy procedures</td><td align="left" valign="top">3 (0.8)</td><td align="left" valign="top" colspan="2">0.333 (0.577)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Unit</td><td align="left" valign="top">43 (12.1)</td><td align="left" valign="top" colspan="2">0.372 (0.489)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Value</td><td align="left" valign="top">2 (0.6)</td><td align="left" valign="top" colspan="2">1.000 (0)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>World Health Organization score</td><td align="left" valign="top">6 (1.7)</td><td align="left" valign="top" colspan="2">0.833 (0.408)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>TNM: tumor, node, and metastasis.</p></fn><fn id="table2fn2"><p><sup>b</sup>N/A: not 
available.</p></fn></table-wrap-foot></table-wrap><p>High consistency was observed in most categories. Categories such as &#x201C;Value&#x201D; and &#x201C;Histology&#x201D; showed perfect agreement (mean &#x03BA; 1.000, SD 0), indicating fully consistent mappings across all variables within these categories. &#x201C;Laboratory tests&#x201D; and &#x201C;Diagnosis&#x201D; demonstrated strong agreement (mean &#x03BA; 0.968, SD 0.178 and mean &#x03BA; 0.842, SD 0.375, respectively), with variability observed at the variable level.</p><p>Both &#x201C;Clinical TNM staging&#x201D; and &#x201C;Pathological TNM staging&#x201D; showed high mean &#x03BA; values (0.921 and 0.926, respectively) but with notable SDs (0.273 and 0.267, respectively), indicating variability across individual variables. Categories such as &#x201C;Unit&#x201D; (mean &#x03BA; 0.372, SD 0.489) and &#x201C;Therapy procedures&#x201D; (mean &#x03BA; 0.333, SD 0.577) exhibited lower consistency and substantial variability, driven by differences in unit recording practices and the absence of specific OMOP concept IDs for certain therapies (eg, immunotherapy).</p><p>&#x201C;Gender&#x201D; and &#x201C;Smoking status&#x201D; showed no agreement (&#x03BA;=0). For &#x201C;Smoking status,&#x201D; no variability measure could be calculated (n=1), reflecting complete mapping failure due to unstructured source data. For &#x201C;Gender,&#x201D; the &#x03BA; value of 0 was due to the absence of this variable in the reference standard. Consequently, no predefined mapping specification was available, leading to a mismatch between the implemented mapping and the expected reference standard. These findings indicate variability in the data dictionary and the lack of a structured representation of specific variables within the source systems.</p></sec><sec id="s3-2"><title>Completeness</title><p>Completeness was assessed by comparing the number of records and patients before and after mapping. 
The initial data quality test revealed that only approximately half of the patients were successfully mapped. This low completeness rate was associated with discrepancies in <italic>ICD-10</italic> code mappings and inconsistencies across source systems. For example, some patients diagnosed with lung cancer appeared in the MZG data source but were missing from the MOC application data, or vice versa. These mismatches were accompanied by the temporary exclusion of affected records from the mapped dataset. An iterative data quality improvement process was applied. With each iteration, additional patients were reincluded. By the final iteration, only a single patient remained excluded due to an unresolved classification issue. This patient had relevant clinical data in the laboratory system but was categorized in the source data as an outpatient consultation. As the ETL pipeline was configured to extract only hospitalized patients, this record could not be incorporated. Consequently, all but one patient were successfully included in the final dataset.</p><p>For most variables, patient and record counts remained complete in the final dataset, indicating a successful transformation. However, a few exceptions persisted. The variable &#x201C;Smoking status&#x201D; exhibited complete data loss because it was stored in an unstructured free-text field that combined information on smoking, drug abuse, and alcohol use. This format made it impossible to extract smoking-specific content for standardized mapping. Additionally, minor data loss was observed in the &#x201C;Unit&#x201D; and &#x201C;Diagnosis&#x201D; categories, which stemmed from inconsistencies in data representation or mapping complexity.</p></sec><sec id="s3-3"><title>Challenges Encountered During the Mapping Process</title><p>Implementing the OMOP CDM revealed several structural and semantic challenges that complicated the mapping process. 
One considerable issue was the difference in data granularity between our hospital and the reference standard. For instance, in the condition table, the <italic>ICD-10</italic> code C34.1, which represents lung cancer of the upper lobe, was inconsistently mapped to different OMOP codes based on whether a 4-digit (C34.1) or 5-digit (C34.10) variation of the code was used. Although there is no clinical difference between codes C34.1 and C34.10, this inconsistency arose because the data dictionary only accounted for 4-digit codes, whereas the system at our hospital used a more granular collection process. Furthermore, SNOMED CT, the standard OMOP vocabulary for conditions, introduced additional complexity by mapping back to multiple codes for a single condition. This duality made maintaining consistency and alignment with the OMOP CDM challenging. This is presented in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Different Systematized Nomenclature of Medicine&#x2013;Clinical Terms (SNOMED CT) codes derived from code C34.1. <italic>ICD-10</italic>: <italic>International Classification of Diseases, 10th Revision</italic>.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e90246_fig01.png"/></fig><p>Another major complication was the reliance on free-text fields in the source data. For instance, the &#x201C;Smoking status&#x201D; variable was captured in an unstructured field in the EHR that combined drug abuse, smoking, and alcohol abuse into a single text box. Consequently, it was impossible to reliably determine whether the recorded information referred specifically to smoking, alcohol use, or drug abuse. This unstructured format prevented mapping to standardized OMOP concepts and led to the exclusion of these data during transformation. 
Notably, no natural language processing techniques were used in this research, further limiting the ability to computationally extract and interpret such information. These issues were further exacerbated by the design of our hospital&#x2019;s EHR system, which is a home-grown platform historically optimized for clinical documentation rather than structured data capture. The current configuration of the EHR, with limited use of standardized fields, made the extraction and transformation process more challenging. Height and weight were initially derived from unstructured data fields in the EHR. However, a curation and parsing workflow was established, making these variables usable within the project (as stored in the DSI-DWH). A transition toward more structured data entry has recently been initiated, which is expected to facilitate future data standardization.</p><p>A more subtle but impactful challenge was the evolving state of the data dictionary during the project. As no finalized version was agreed upon at the project&#x2019;s start, the data dictionary continued to evolve, often introducing inconsistencies. For this study, we used the version of the dictionary established on December 1, 2024. However, several updates (such as the change in the source of chemotherapy data from procedures to medication records) required periodic reassessment of our mappings. Although these changes were ultimately controlled for, they contributed to mapping delays and highlighted the need for stable definitions early in such projects.</p><p>Beyond data structure and documentation, the specialized knowledge required for OMOP CDM mapping proved to be a limiting factor. Mapping variables, resolving inconsistencies, and applying the correct logic demanded not only a solid understanding of the OMOP CDM but also clinical insight into the source data. 
The lack of domain expertise within the team sometimes caused delays, especially when clinical interpretation was needed to resolve ambiguous cases.</p><p>Finally, inconsistencies across the OMOP projects themselves presented challenges. Interactions with other OMOP initiatives revealed differences in data dictionaries and coding approaches. These differences included variations in variable definitions, mapping choices, and levels of coding granularity for similar clinical concepts. Consequently, alignment across projects required additional reconciliation efforts, increasing the complexity of the mapping process.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study evaluated the quality of the mapping process of lung cancer data to the OMOP CDM, with a focus on completeness and consistency, and identified key challenges encountered during implementation. Overall, high consistency was achieved for structured variables, while unstructured data and variability in coding practices posed challenges. Completeness improved substantially through iterative data quality refinement, highlighting the importance of continuous validation during the ETL process. These findings align with the study objectives of assessing mapping quality and identifying barriers to effective OMOP implementation.</p></sec><sec id="s4-2"><title>Data Quality Assessment</title><p>The findings demonstrated variability in consistency and completeness across categories. Variables with structured data and robust reference standards, such as &#x201C;Histology,&#x201D; &#x201C;Value,&#x201D; and &#x201C;Laboratory tests,&#x201D; achieved high &#x03BA; scores with low variability, indicating stable and reproducible mappings across variables. 
In contrast, categories that required greater clinical interpretation or manual mapping, including &#x201C;Clinical TNM staging,&#x201D; &#x201C;Pathological TNM staging,&#x201D; and &#x201C;Genetic mutations,&#x201D; showed high &#x03BA; values but substantial SDs, reflecting heterogeneous agreement at the variable level. This variability indicates that although overall mapping performance was strong, individual variables within these categories posed specific challenges.</p><p>A key contributor to this variability was differences in coding granularity. Variations in the level of detail captured in source systems, such as the use of 4-digit vs 5-digit <italic>ICD-10</italic> codes, introduced inconsistencies during mapping despite representing clinically equivalent concepts. This reflects a broader challenge in OMOP ETL processes, in which differences in coding specificity across institutions can affect semantic alignment and the consistency of standardized data.</p><p>Categories such as &#x201C;Unit,&#x201D; &#x201C;Therapy procedures,&#x201D; &#x201C;Gender,&#x201D; and &#x201C;Smoking status&#x201D; encountered challenges, reflecting the difficulties associated with ambiguous, incomplete, or unstructured data. These findings were consistent with previous studies, in which effective mappings were established for well-structured and standardized variables [<xref ref-type="bibr" rid="ref35">35</xref>]. Unstructured variables, such as &#x201C;Smoking status,&#x201D; posed a particular challenge. Captured as free text in the EHR, this field combined multiple categories, making it impossible to extract smoking-specific information for mapping to OMOP concepts. These findings highlight a challenge in OMOP implementations related to the handling of unstructured data, which requires additional preprocessing before integration into the standardized model. 
Previous research has shown that free-text data often leads to data exclusion during OMOP transformation, limiting the accuracy of analyses reliant on such variables [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref36">36</xref>].</p><p>The completeness analysis revealed that structured data generally retained patient and record counts after mapping. For instance, variables such as &#x201C;Histology&#x201D; and &#x201C;Radiotherapy&#x201D; achieved nearly complete preservation of records. However, the absence of structured standards for certain categories led to minor data loss. For example, inconsistencies in the recording of units and granularity differences in diagnostic codes resulted in missing data during transformation. Although these losses were minimal, they highlight the need for enhanced preprocessing and harmonization workflows to mitigate discrepancies across source systems. These findings align with previous research that has identified similar challenges [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref35">35</xref>].</p></sec><sec id="s4-3"><title>Challenges Encountered</title><p>The transformation process of mapping lung cancer data to the OMOP CDM highlights several challenges, encompassing both technical data issues and broader systemic and knowledge-related barriers. Although the technical aspects of the data, such as unstructured text and inconsistent coding practices, are well-recognized sources of data quality issues [<xref ref-type="bibr" rid="ref37">37</xref>], this study demonstrates that the challenges extend beyond these technical constraints.</p><p>The major challenge encountered was related to the data dictionary. This data dictionary, provided by the reference hospital, offered a 1-to-1 mapping between raw data elements and OMOP concept IDs, serving as a useful starting point. However, 2 challenges emerged related to the data dictionary. 
First, frequent updates by the reference hospital invalidated previously consistent mappings, forcing manual remapping efforts. This not only increased the workload but also introduced a higher risk of mapping errors. These disruptions highlight the critical need to finalize a stable and harmonized data dictionary before initiating the project. Establishing such a standardized data dictionary would minimize unnecessary adjustments and reduce inconsistencies during the mapping process. Second, although the 1-to-1 mapping approach provided by the reference site was initially helpful, the transformation process revealed its limitations. A more enriched data dictionary is essential to support consistent and complete mappings. This enriched version should include detailed mapping rules, clinical context, and clear rationales for assigning raw data elements to specific OMOP IDs to address these gaps.</p><p>Another challenge lies in the variability of how data are collected, structured, and recorded across hospitals. Unstructured data, such as free-text entries in EHRs, often result in missing or unusable data during mapping [<xref ref-type="bibr" rid="ref38">38</xref>]. Similarly, coding discrepancies, such as differences in granularity between source data, can lead to inconsistencies and missing values [<xref ref-type="bibr" rid="ref39">39</xref>]. These technical issues not only reduce the completeness of the mapped data but also hinder its clinical applicability and analytical utility. Differences in granularity, terminology, and data format between hospitals further exacerbate these challenges, introducing biases during the extraction and mapping process.</p><p>Beyond technical challenges, the success of the transformation process depends heavily on the knowledge and expertise of the individuals involved. 
Mapping requires a deep understanding of the OMOP CDM framework, including its vocabularies, as well as comprehensive knowledge of the clinical and technical aspects of the source data and source systems. Insufficient expertise can result in mapping errors or inconsistencies, particularly for complex variables requiring nuanced interpretation. Differences in how data are collected and structured between the 2 hospitals introduced inconsistencies in the mapping process. Differences in source systems between institutions can introduce biases, as variations in data collection may not always be fully accounted for in the mapping strategy.</p></sec><sec id="s4-4"><title>Implications for Practice and Research</title><p>The findings of this study have several implications for future OMOP implementations. First, they highlight the importance of structured data capture at the source, as unstructured data limit downstream usability. Second, there is a need for stable and enriched data dictionaries that include detailed mapping logic and clinical context. Finally, the results demonstrate that iterative data quality assessment is essential to achieve high completeness and consistency.</p><p>Mapping raw health data to the OMOP CDM is a complex process requiring in-depth planning, a structured approach, and multidisciplinary collaboration to ensure high-quality outcomes. On the basis of the insights from this study, the following recommendations are proposed to address gaps identified during the transformation process:</p><list list-type="order"><list-item><p>Develop an enriched and stable data dictionary: move beyond 1-to-1 mappings by creating a data dictionary that includes detailed mapping logic and explanatory rationale. 
This enriched data dictionary should capture the clinical context of variables (eg, the underlying reason and circumstances under which a variable is captured), the structure of the source data, and any transformations applied to align with OMOP conventions. Finalizing a stable and harmonized data dictionary before initiating the project will prevent data quality issues from occurring during the transformation process.</p></list-item><list-item><p>Leverage data profiling to address variability across hospitals: systematically profile source data to understand their structure, coding practices, completeness, and variability. This includes identifying differences in data formats, coding systems (eg, <italic>ICD-10</italic> use), value distributions, and missing data patterns prior to mapping.</p></list-item><list-item><p>Strengthen expertise through training and collaboration: equip mapping teams with training programs that provide an in-depth understanding of the OMOP CDM framework, including its tables, relationships, and vocabularies [<xref ref-type="bibr" rid="ref40">40</xref>]. Encourage collaboration between medical and technical experts to ensure that mapping strategies capture both clinical accuracy and technical precision.</p></list-item><list-item><p>Automate the mapping process where possible: use automation where feasible to improve consistency and reduce manual effort, particularly for standardized variables.</p></list-item><list-item><p>Conduct data quality assessments: perform data quality assessments before and after mapping to identify inconsistencies and ensure completeness of the transformed dataset.</p></list-item></list><p>These recommendations provide a structured framework to improve mapping quality and enhance the reliability of standardized datasets for multicenter research.</p></sec><sec id="s4-5"><title>Limitations</title><p>This study has some limitations. 
First, no formal data quality assessment was performed after extracting raw data. Consequently, potential inaccuracies or inconsistencies in the source data may have influenced the mapping outcomes, affecting the quality of the final OMOP dataset. Second, the role and timing of medical expert involvement remain unclear. Although their expertise is crucial for interpreting raw data and ensuring consistent mappings, their absence during the creation of the data dictionary may have limited its clinical relevance. Finally, uncertainty about when to involve experts in the workflow&#x2014;whether during data interpretation, technical mapping, or both&#x2014;may have affected the consistency of the process. Addressing these limitations through postextraction quality checks and clearer integration of medical expertise can improve the trustworthiness and reliability of future mapping efforts.</p></sec><sec id="s4-6"><title>Conclusions</title><p>This study highlights the challenges of mapping lung cancer data to the OMOP CDM, particularly in managing unstructured data, addressing granularity discrepancies, and adapting to evolving reference standards. Although high consistency was achieved for structured variables, limitations in handling free-text data and incomplete mapping logic documentation revealed areas for improvement. The interplay between technical challenges and nontechnical factors, such as human expertise and system variability, highlights the need for a multidisciplinary approach to OMOP CDM implementation. Collaborations between clinical experts, data scientists, and data engineers are essential to bridge gaps in knowledge and address the complexities of transforming diverse health care data into a standardized format. Furthermore, fostering a shared understanding of the source systems across sites and aligning on best practices for mapping logic can improve the reliability of the mapped data. 
These insights lay the groundwork for creating harmonized datasets to support robust multicenter research and clinical analytics.</p></sec></sec></body><back><ack><p>The authors thank the Data Science Institute of Ghent University Hospital for its valuable guidance and for providing the opportunity to conduct this study. This work would not have been possible without their support.</p><p>ChatGPT (OpenAI) was used as a writing assistant to improve the language and clarity of the manuscript. The authors maintained full control over the content, verified the accuracy of all artificial intelligence&#x2013;generated suggestions, and take full responsibility for the integrity of the final manuscript.</p></ack><notes><sec><title>Funding</title><p>The authors declared that no financial support was received for this work.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">CDM</term><def><p>Common Data Model</p></def></def-item><def-item><term id="abb2">DSI</term><def><p>Data Science Institute</p></def></def-item><def-item><term id="abb3">DWH</term><def><p>Data Warehouse</p></def></def-item><def-item><term id="abb4">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb5">ETL</term><def><p>extract, transform, and load</p></def></def-item><def-item><term id="abb6">FHIN</term><def><p>Federated Health Innovation Network</p></def></def-item><def-item><term id="abb7"><italic>ICD-10</italic></term><def><p><italic>International Classification of Diseases, 10th Revision</italic></p></def></def-item><def-item><term id="abb8">LOINC</term><def><p>Logical Observation Identifiers Names and Codes</p></def></def-item><def-item><term id="abb9">MOC</term><def><p>Multidisciplinary Oncology Consultation</p></def></def-item><def-item><term id="abb10">MZG</term><def><p>Minimale Ziekenhuisgegevens</p></def></def-item><def-item><term 
id="abb11">OMOP</term><def><p>Observational Medical Outcomes Partnership</p></def></def-item><def-item><term id="abb12">SNOMED CT</term><def><p>Systematized Nomenclature of Medicine&#x2013;Clinical Terms</p></def></def-item><def-item><term id="abb13">TNM</term><def><p>tumor, node, and metastasis</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jungkunz</surname><given-names>M</given-names> </name><name name-style="western"><surname>K&#x00F6;ngeter</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mehlis</surname><given-names>K</given-names> </name><name name-style="western"><surname>Winkler</surname><given-names>EC</given-names> </name><name name-style="western"><surname>Schickhardt</surname><given-names>C</given-names> </name></person-group><article-title>Secondary use of clinical data in data-gathering, non-interventional research or learning activities: definition, types, and a framework for risk assessment</article-title><source>J Med Internet Res</source><year>2021</year><month>06</month><day>8</day><volume>23</volume><issue>6</issue><fpage>e26631</fpage><pub-id pub-id-type="doi">10.2196/26631</pub-id><pub-id pub-id-type="medline">34100760</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Coorevits</surname><given-names>P</given-names> </name><name name-style="western"><surname>Sundgren</surname><given-names>M</given-names> </name><name name-style="western"><surname>Klein</surname><given-names>GO</given-names> </name><etal/></person-group><article-title>Electronic health records: new opportunities for clinical research</article-title><source>J Intern 
Med</source><year>2013</year><month>12</month><volume>274</volume><issue>6</issue><fpage>547</fpage><lpage>560</lpage><pub-id pub-id-type="doi">10.1111/joim.12119</pub-id><pub-id pub-id-type="medline">23952476</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fruchart</surname><given-names>M</given-names> </name><name name-style="western"><surname>Quindroit</surname><given-names>P</given-names> </name><name name-style="western"><surname>Jacquemont</surname><given-names>C</given-names> </name><name name-style="western"><surname>Beuscart</surname><given-names>JB</given-names> </name><name name-style="western"><surname>Calafiore</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lamer</surname><given-names>A</given-names> </name></person-group><article-title>Transforming primary care data into the observational medical outcomes partnership common data model: development and usability study</article-title><source>JMIR Med Inform</source><year>2024</year><month>08</month><day>13</day><volume>12</volume><fpage>e49542</fpage><pub-id pub-id-type="doi">10.2196/49542</pub-id><pub-id pub-id-type="medline">39140273</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oja</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tamm</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mooses</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Transforming Estonian health data to the Observational Medical Outcomes Partnership (OMOP) common data model: lessons learned</article-title><source>JAMIA Open</source><year>2023</year><month>12</month><volume>6</volume><issue>4</issue><fpage>ooad100</fpage><pub-id 
pub-id-type="doi">10.1093/jamiaopen/ooad100</pub-id><pub-id pub-id-type="medline">38058679</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Raman</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Curtis</surname><given-names>LH</given-names> </name><name name-style="western"><surname>Temple</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Leveraging electronic health records for clinical research</article-title><source>Am Heart J</source><year>2018</year><month>08</month><volume>202</volume><fpage>13</fpage><lpage>19</lpage><pub-id pub-id-type="doi">10.1016/j.ahj.2018.04.015</pub-id><pub-id pub-id-type="medline">29802975</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>von Lucadou</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ganslandt</surname><given-names>T</given-names> </name><name name-style="western"><surname>Prokosch</surname><given-names>HU</given-names> </name><name name-style="western"><surname>Toddenroth</surname><given-names>D</given-names> </name></person-group><article-title>Feasibility analysis of conducting observational studies with the electronic health record</article-title><source>BMC Med Inform Decis Mak</source><year>2019</year><month>10</month><day>28</day><volume>19</volume><issue>1</issue><fpage>202</fpage><pub-id pub-id-type="doi">10.1186/s12911-019-0939-0</pub-id><pub-id pub-id-type="medline">31660955</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Puttkammer</surname><given-names>N</given-names> </name><name 
name-style="western"><surname>Baseman</surname><given-names>JG</given-names> </name><name name-style="western"><surname>Devine</surname><given-names>EB</given-names> </name><etal/></person-group><article-title>An assessment of data quality in a multi-site electronic medical record system in Haiti</article-title><source>Int J Med Inform</source><year>2016</year><month>02</month><volume>86</volume><fpage>104</fpage><lpage>116</lpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2015.11.003</pub-id><pub-id pub-id-type="medline">26620698</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hribar</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Read-Brown</surname><given-names>S</given-names> </name><name name-style="western"><surname>Goldstein</surname><given-names>IH</given-names> </name><etal/></person-group><article-title>Secondary use of electronic health record data for clinical workflow analysis</article-title><source>J Am Med Inform Assoc</source><year>2018</year><month>01</month><day>1</day><volume>25</volume><issue>1</issue><fpage>40</fpage><lpage>46</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocx098</pub-id><pub-id pub-id-type="medline">29036581</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bernardi</surname><given-names>FA</given-names> </name><name name-style="western"><surname>Mello de Oliveira</surname><given-names>B</given-names> </name><name name-style="western"><surname>Bettiol Yamada</surname><given-names>D</given-names> </name><etal/></person-group><article-title>The minimum data set for rare diseases: systematic review</article-title><source>J Med Internet Res</source><year>2023</year><month>07</month><day>27</day><volume>25</volume><fpage>e44641</fpage><pub-id 
pub-id-type="doi">10.2196/44641</pub-id><pub-id pub-id-type="medline">37498666</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bernardo</surname><given-names>BM</given-names> </name><name name-style="western"><surname>Mamede</surname><given-names>HS</given-names> </name><name name-style="western"><surname>Barroso</surname><given-names>JM</given-names> </name><name name-style="western"><surname>dos Santos</surname><given-names>VM</given-names> </name></person-group><article-title>Data governance &#x0026; quality management&#x2014;innovation and breakthroughs across different fields</article-title><source>J Innov Knowl</source><year>2024</year><month>10</month><volume>9</volume><issue>4</issue><fpage>100598</fpage><pub-id pub-id-type="doi">10.1016/j.jik.2024.100598</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Declerck</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kalra</surname><given-names>D</given-names> </name><name name-style="western"><surname>Vander Stichele</surname><given-names>R</given-names> </name><name name-style="western"><surname>Coorevits</surname><given-names>P</given-names> </name></person-group><article-title>Frameworks, dimensions, definitions of aspects, and assessment methods for the appraisal of quality of health data for secondary use: comprehensive overview of reviews</article-title><source>JMIR Med Inform</source><year>2024</year><month>03</month><day>6</day><volume>12</volume><fpage>e51560</fpage><pub-id pub-id-type="doi">10.2196/51560</pub-id><pub-id pub-id-type="medline">38446534</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Bernardi</surname><given-names>FA</given-names> </name><name name-style="western"><surname>Alves</surname><given-names>D</given-names> </name><name name-style="western"><surname>Crepaldi</surname><given-names>N</given-names> </name><name name-style="western"><surname>Yamada</surname><given-names>DB</given-names> </name><name name-style="western"><surname>Lima</surname><given-names>VC</given-names> </name><name name-style="western"><surname>Rijo</surname><given-names>R</given-names> </name></person-group><article-title>Data quality in health research: integrative literature review</article-title><source>J Med Internet Res</source><year>2023</year><month>10</month><day>31</day><volume>25</volume><fpage>e41446</fpage><pub-id pub-id-type="doi">10.2196/41446</pub-id><pub-id pub-id-type="medline">37906223</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goldberg</surname><given-names>SI</given-names> </name><name name-style="western"><surname>Niemierko</surname><given-names>A</given-names> </name><name name-style="western"><surname>Turchin</surname><given-names>A</given-names> </name></person-group><article-title>Analysis of data errors in clinical research databases</article-title><source>AMIA Annu Symp Proc</source><year>2008</year><month>11</month><day>6</day><access-date>2025-05-25</access-date><volume>2008</volume><fpage>242</fpage><lpage>246</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://pubmed.ncbi.nlm.nih.gov/18998889/">https://pubmed.ncbi.nlm.nih.gov/18998889/</ext-link></comment><pub-id pub-id-type="medline">18998889</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Adeniran</surname><given-names>IA</given-names> </name><name 
name-style="western"><surname>Efunniyi</surname><given-names>CP</given-names> </name><name name-style="western"><surname>Osundare</surname><given-names>OS</given-names> </name><name name-style="western"><surname>Abhulimen</surname><given-names>AO</given-names> </name></person-group><article-title>Data-driven decision-making in healthcare: improving patient outcomes through predictive modeling</article-title><source>Int J Scholarly Res Multidiscip Studies</source><year>2024</year><volume>5</volume><issue>1</issue><fpage>059</fpage><lpage>067</lpage><pub-id pub-id-type="doi">10.56781/ijsrms.2024.5.1.0040</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wiebe</surname><given-names>N</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Shaheen</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Eastwood</surname><given-names>C</given-names> </name><name name-style="western"><surname>Boussat</surname><given-names>B</given-names> </name><name name-style="western"><surname>Quan</surname><given-names>H</given-names> </name></person-group><article-title>Indicators of missing Electronic Medical Record (EMR) discharge summaries: a retrospective study on Canadian data</article-title><source>Int J Popul Data Sci</source><year>2020</year><month>12</month><day>11</day><volume>5</volume><issue>1</issue><fpage>1352</fpage><pub-id pub-id-type="doi">10.23889/ijpds.v5i3.1352</pub-id><pub-id pub-id-type="medline">34007880</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Madigan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Ryan</surname><given-names>PB</given-names> </name><name 
name-style="western"><surname>Schuemie</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Evaluating the impact of database heterogeneity on observational study results</article-title><source>Am J Epidemiol</source><year>2013</year><month>08</month><day>15</day><volume>178</volume><issue>4</issue><fpage>645</fpage><lpage>651</lpage><pub-id pub-id-type="doi">10.1093/aje/kwt010</pub-id><pub-id pub-id-type="medline">23648805</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Overhage</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Ryan</surname><given-names>PB</given-names> </name><name name-style="western"><surname>Reich</surname><given-names>CG</given-names> </name><name name-style="western"><surname>Hartzema</surname><given-names>AG</given-names> </name><name name-style="western"><surname>Stang</surname><given-names>PE</given-names> </name></person-group><article-title>Validation of a common data model for active safety surveillance research</article-title><source>J Am Med Inform Assoc</source><year>2012</year><volume>19</volume><issue>1</issue><fpage>54</fpage><lpage>60</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2011-000376</pub-id><pub-id pub-id-type="medline">22037893</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liaw</surname><given-names>ST</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>JG</given-names> </name><name name-style="western"><surname>Ansari</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Quality assessment of real-world data repositories across the data life cycle: a literature review</article-title><source>J Am Med Inform 
Assoc</source><year>2021</year><month>07</month><day>14</day><volume>28</volume><issue>7</issue><fpage>1591</fpage><lpage>1599</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaa340</pub-id><pub-id pub-id-type="medline">33496785</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kahn</surname><given-names>MG</given-names> </name><name name-style="western"><surname>Callahan</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Barnard</surname><given-names>J</given-names> </name><etal/></person-group><article-title>A harmonized data quality assessment terminology and framework for the secondary use of electronic health record data</article-title><source>EGEMS (Wash DC)</source><year>2016</year><volume>4</volume><issue>1</issue><fpage>1244</fpage><pub-id pub-id-type="doi">10.13063/2327-9214.1244</pub-id><pub-id pub-id-type="medline">27713905</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liaw</surname><given-names>ST</given-names> </name><name name-style="western"><surname>Rahimi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ray</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Towards an ontology for data quality in integrated chronic disease management: a realist review of the literature</article-title><source>Int J Med Inform</source><year>2013</year><month>01</month><volume>82</volume><issue>1</issue><fpage>10</fpage><lpage>24</lpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2012.10.001</pub-id><pub-id pub-id-type="medline">23122633</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Ozonze</surname><given-names>O</given-names> </name><name name-style="western"><surname>Scott</surname><given-names>PJ</given-names> </name><name name-style="western"><surname>Hopgood</surname><given-names>AA</given-names> </name></person-group><article-title>Automating electronic health record data quality assessment</article-title><source>J Med Syst</source><year>2023</year><month>02</month><day>13</day><volume>47</volume><issue>1</issue><fpage>23</fpage><pub-id pub-id-type="doi">10.1007/s10916-022-01892-2</pub-id><pub-id pub-id-type="medline">36781551</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Aerts</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kalra</surname><given-names>D</given-names> </name><name name-style="western"><surname>Saez</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Is the quality of hospital EHR data sufficient to evidence its ICHOM outcomes performance in heart failure? 
A pilot evaluation</article-title><source>JMIR Med Inform</source><year>2021</year><volume>9</volume><issue>8</issue><fpage>e27842</fpage><pub-id pub-id-type="doi">10.2196/27842</pub-id><pub-id pub-id-type="medline">34346902</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Penning</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zozus</surname><given-names>M</given-names> </name></person-group><article-title>Analysis of anesthesia screens for rule-based data quality assessment opportunities</article-title><source>Stud Health Technol Inform</source><year>2019</year><access-date>2025-05-25</access-date><volume>257</volume><fpage>473</fpage><lpage>478</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://pubmed.ncbi.nlm.nih.gov/30741242/">https://pubmed.ncbi.nlm.nih.gov/30741242/</ext-link></comment><pub-id pub-id-type="medline">30741242</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rudin</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Fischer</surname><given-names>SH</given-names> </name><name name-style="western"><surname>Damberg</surname><given-names>CL</given-names> </name><etal/></person-group><article-title>Optimizing health IT to improve health system performance: a work in progress</article-title><source>Healthc (Amst)</source><year>2020</year><month>12</month><volume>8</volume><issue>4</issue><fpage>100483</fpage><pub-id pub-id-type="doi">10.1016/j.hjdsi.2020.100483</pub-id><pub-id pub-id-type="medline">33068915</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Schmidt</surname><given-names>L</given-names> </name><name name-style="western"><surname>Olorisade</surname><given-names>BK</given-names> </name><name name-style="western"><surname>McGuinness</surname><given-names>LA</given-names> </name><name name-style="western"><surname>Thomas</surname><given-names>J</given-names> </name><name name-style="western"><surname>Higgins</surname><given-names>JPT</given-names> </name></person-group><article-title>Data extraction methods for systematic review (semi)automation: a living systematic review</article-title><source>F1000Res</source><year>2021</year><volume>10</volume><fpage>401</fpage><pub-id pub-id-type="doi">10.12688/f1000research.51117.1</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cascini</surname><given-names>F</given-names> </name><name name-style="western"><surname>Pantovic</surname><given-names>A</given-names> </name><name name-style="western"><surname>Al-Ajlouni</surname><given-names>YA</given-names> </name><name name-style="western"><surname>Puleo</surname><given-names>V</given-names> </name><name name-style="western"><surname>De Maio</surname><given-names>L</given-names> </name><name name-style="western"><surname>Ricciardi</surname><given-names>W</given-names> </name></person-group><article-title>Health data sharing attitudes towards primary and secondary use of data: a systematic review</article-title><source>EClinicalMedicine</source><year>2024</year><month>05</month><volume>71</volume><fpage>102551</fpage><pub-id pub-id-type="doi">10.1016/j.eclinm.2024.102551</pub-id><pub-id pub-id-type="medline">38533128</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Kumar</surname><given-names>G</given-names> </name><name name-style="western"><surname>Basri</surname><given-names>S</given-names> </name><name name-style="western"><surname>Imam</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Khowaja</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Capretz</surname><given-names>LF</given-names> </name><name name-style="western"><surname>Balogun</surname><given-names>AO</given-names> </name></person-group><article-title>Data harmonization for heterogeneous datasets: a systematic literature review</article-title><source>Appl Sci (Basel)</source><year>2021</year><volume>11</volume><issue>17</issue><fpage>8275</fpage><pub-id pub-id-type="doi">10.3390/app11178275</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="book"><person-group person-group-type="author"><collab>OHDSI</collab></person-group><source>The Book of OHDSI: Observational Health Data Sciences and Informatics</source><year>2019</year><access-date>2025-05-25</access-date><publisher-name>OHDSI</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://books.google.co.uk/books?id=JxpnzQEACAAJ">https://books.google.co.uk/books?id=JxpnzQEACAAJ</ext-link></comment><pub-id pub-id-type="other">9781088855195</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McHugh</surname><given-names>ML</given-names> </name></person-group><article-title>Interrater reliability: the Kappa statistic</article-title><source>Biochem Med (Zagreb)</source><year>2012</year><access-date>2025-05-25</access-date><volume>22</volume><issue>3</issue><fpage>276</fpage><lpage>282</lpage><comment><ext-link ext-link-type="uri" 
xlink:href="https://pubmed.ncbi.nlm.nih.gov/23092060/">https://pubmed.ncbi.nlm.nih.gov/23092060/</ext-link></comment><pub-id pub-id-type="medline">23092060</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>De Vries</surname><given-names>H</given-names> </name><name name-style="western"><surname>Elliott</surname><given-names>MN</given-names> </name><name name-style="western"><surname>Kanouse</surname><given-names>DE</given-names> </name><name name-style="western"><surname>Teleki</surname><given-names>SS</given-names> </name></person-group><article-title>Using pooled Kappa to summarize interrater agreement across many items</article-title><source>Field Methods</source><year>2008</year><month>08</month><volume>20</volume><issue>3</issue><fpage>272</fpage><lpage>282</lpage><pub-id pub-id-type="doi">10.1177/1525822X08317166</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="web"><source>Belgian Cancer Registry</source><year>2024</year><access-date>2026-05-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://kankerregister.org/nl">https://kankerregister.org/nl</ext-link></comment></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="web"><article-title>Koninklijk besluit betreffende de vaststelling en de vereffening van het budget van financi&#x00EB;le middelen van de ziekenhuizen</article-title><source>Belgian Federal Government Services</source><year>2002</year><access-date>2025-05-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.ejustice.just.fgov.be/eli/besluit/2002/04/25/2002022335/justel">https://www.ejustice.just.fgov.be/eli/besluit/2002/04/25/2002022335/justel</ext-link></comment></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation 
citation-type="web"><article-title>RADar-AZDelta/Keun</article-title><source>GitHub</source><access-date>2025-05-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/RADar-AZDelta/Keun">https://github.com/RADar-AZDelta/Keun</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="web"><article-title>RADar-AZDelta/Rabbit-in-a-Blender</article-title><source>GitHub</source><access-date>2025-05-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/RADar-AZDelta/Rabbit-in-a-Blender">https://github.com/RADar-AZDelta/Rabbit-in-a-Blender</ext-link></comment></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Biedermann</surname><given-names>P</given-names> </name><name name-style="western"><surname>Ong</surname><given-names>R</given-names> </name><name name-style="western"><surname>Davydov</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Standardizing registry data to the OMOP Common Data Model: experience from three pulmonary hypertension databases</article-title><source>BMC Med Res Methodol</source><year>2021</year><month>11</month><day>2</day><volume>21</volume><issue>1</issue><fpage>238</fpage><pub-id pub-id-type="doi">10.1186/s12874-021-01434-3</pub-id><pub-id pub-id-type="medline">34727871</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ehsani-Moghaddam</surname><given-names>B</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>K</given-names> </name><name name-style="western"><surname>Queenan</surname><given-names>JA</given-names> </name></person-group><article-title>Data quality in healthcare: a report of practical experience with the Canadian 
Primary Care Sentinel Surveillance Network data</article-title><source>Health Inf Manag</source><year>2021</year><volume>50</volume><issue>1-2</issue><fpage>88</fpage><lpage>92</lpage><pub-id pub-id-type="doi">10.1177/1833358319887743</pub-id><pub-id pub-id-type="medline">31805788</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kent</surname><given-names>S</given-names> </name><name name-style="western"><surname>Burn</surname><given-names>E</given-names> </name><name name-style="western"><surname>Dawoud</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Common problems, common data model solutions: evidence generation for health technology assessment</article-title><source>Pharmacoeconomics</source><year>2021</year><month>03</month><volume>39</volume><issue>3</issue><fpage>275</fpage><lpage>285</lpage><pub-id pub-id-type="doi">10.1007/s40273-020-00981-9</pub-id><pub-id pub-id-type="medline">33336320</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sedlakova</surname><given-names>J</given-names> </name><name name-style="western"><surname>Daniore</surname><given-names>P</given-names> </name><name name-style="western"><surname>Horn Wintsch</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Challenges and best practices for digital unstructured data enrichment in health research: a systematic narrative review</article-title><source>PLOS Digit Health</source><year>2023</year><month>10</month><volume>2</volume><issue>10</issue><fpage>e0000347</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000347</pub-id><pub-id pub-id-type="medline">37819910</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Syed</surname><given-names>R</given-names> </name><name name-style="western"><surname>Eden</surname><given-names>R</given-names> </name><name name-style="western"><surname>Makasi</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Digital health data quality issues: systematic review</article-title><source>J Med Internet Res</source><year>2023</year><month>03</month><day>31</day><volume>25</volume><fpage>e42615</fpage><pub-id pub-id-type="doi">10.2196/42615</pub-id><pub-id pub-id-type="medline">37000497</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="book"><person-group person-group-type="author"><collab>Observational Health Data Sciences and Informatics</collab></person-group><source>The Book of OHDSI</source><year>2021</year><access-date>2026-05-28</access-date><publisher-name>Observational Health Data Sciences and Informatics</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://ohdsi.github.io/TheBookOfOhdsi/">https://ohdsi.github.io/TheBookOfOhdsi/</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>SQL scripts using the vocabularies and relationships defined in the Observational Medical Outcomes Partnership Common Data Model version 5.4.</p><media xlink:href="jmir_v28i1e90246_app1.docx" xlink:title="DOCX File, 18 KB"/></supplementary-material></app-group></back></article>