<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v27i1e64628</article-id><article-id pub-id-type="doi">10.2196/64628</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Implications of Data Extraction and Processing of Electronic Health Records for Epidemiological Research: Observational Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>van Essen</surname><given-names>Melissa H J</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Twickler</surname><given-names>Robin</given-names></name><degrees>BSc</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Weesie</surname><given-names>Yvette M</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Arslan</surname><given-names>Ilgin G</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Groenhof</surname><given-names>Feikje</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Peters</surname><given-names>Lilian L</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Bos</surname><given-names>Isabelle</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Verheij</surname><given-names>Robert A</given-names></name><degrees>Prof Dr</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff4">4</xref></contrib></contrib-group><aff id="aff1"><institution>Tranzo, School of Social Sciences and Behavioural Research, Tilburg University</institution><addr-line>Reitse Poort 1, RP126, Professor Cobbenhagenlaan 125</addr-line><addr-line>Tilburg</addr-line><country>The Netherlands</country></aff><aff id="aff2"><institution>Nivel, Netherlands Institute for Health Services Research</institution><addr-line>Utrecht</addr-line><country>The Netherlands</country></aff><aff id="aff3"><institution>Department of General Practice and Elderly Care Medicine, University Medical Centre Groningen</institution><addr-line>Groningen</addr-line><country>The Netherlands</country></aff><aff id="aff4"><institution>National Health Care Institute</institution><addr-line>Diemen</addr-line><country>The Netherlands</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib><contrib 
contrib-type="editor"><name name-style="western"><surname>Leung</surname><given-names>Tiffany</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Senst</surname><given-names>Benjamin</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Tribby</surname><given-names>Calvin P</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Boven</surname><given-names>Kees van</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Goyal</surname><given-names>Shreya</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Melissa H J van Essen, MSc, Tranzo, School of Social Sciences and Behavioural Research, Tilburg University, Reitse Poort 1, RP126, Professor Cobbenhagenlaan 125, Tilburg, The Netherlands, 31 631978419; <email>m.h.j.vanessen@tilburguniversity.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>11</day><month>6</month><year>2025</year></pub-date><volume>27</volume><elocation-id>e64628</elocation-id><history><date date-type="received"><day>22</day><month>07</month><year>2024</year></date><date date-type="rev-recd"><day>06</day><month>03</month><year>2025</year></date><date date-type="accepted"><day>06</day><month>03</month><year>2025</year></date></history><copyright-statement>&#x00A9; Melissa H J van Essen, Robin Twickler, Yvette M Weesie, Ilgin G Arslan, Feikje Groenhof, Lilian L Peters, Isabelle Bos, Robert A Verheij. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 11.6.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2025/1/e64628"/><abstract><sec><title>Background</title><p>The use of routinely recorded electronic health record (EHR) data is increasingly common, especially in epidemiological research. However, data must be processed and prepared for secondary use, and decisions made during this process could significantly impact research outcomes. A demonstration of the extent of these consequences is necessary.</p></sec><sec><title>Objective</title><p>The aim of this study was to investigate the influence of data processing steps on research outcomes derived from the secondary use of EHR data.</p></sec><sec sec-type="methods"><title>Methods</title><p>EHR data from 8 Dutch general practices from 2019 were used. These practices contributed data to 2 research databases: the Academic General Practitioner Development Network registry and the Nivel Primary Care Database. 
Data were extracted and processed through distinct extraction, transformation, and loading (ETL) pipelines, allowing the evaluation of the impact of different ETL methods by comparing the 2 datasets in three steps: (1) patient demographics, (2) epidemiology of concordant patients, and (3) health service use of patients with 3 diagnoses. A number of similarity indicators, including the number of contacts, regular consultations and visits, prescriptions, and episodes, were compared between the 2 databases. The outcomes were compared by performing paired samples <italic>t</italic> tests using 99% CIs. Prevalence, number of prescriptions, and number of regular consultations and visits per 1000 patient years were calculated and compared for 3 diagnoses (diabetes mellitus, urinary tract infection, and cough). These outcomes were compared using the SD.</p></sec><sec sec-type="results"><title>Results</title><p>Differences were observed between the datasets in the number of enrolled patients (Academic General Practitioner Development Network registry: n=47,517; Nivel Primary Care Database: n=44,247). Despite this, patient demographics were similar. All indicator outcomes of the concordant patients showed significant differences between the databases, that is, the number of contacts, prescriptions, and episodes per patient, and the number of regular consultations and visits. Differences in the indicator outcomes for the 3 diagnosis groups varied greatly in SD; however, none of the differences were deemed significant.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The findings highlight the importance of routine health data users&#x2019; awareness of the different ETL steps involved. Transparency and shared knowledge about these processes are critical, and making them available for research is necessary. 
Data processors should share their knowledge regarding their choices, and researchers and policy makers should invest in their knowledge of this type of metadata. Transparency and shared knowledge are particularly important in light of the European Health Data Space and the ever-increasing secondary use of routinely recorded health data. Future research should focus on the role of transparency, joint decision-making, and the minimization of effects of ETL steps, and on the insight into the individual influence of ETL steps on research outcomes. This could stimulate standardized approaches among data processors and researchers, resulting in increased data interoperability.</p></sec></abstract><kwd-group><kwd>routine health care data</kwd><kwd>electronic health records</kwd><kwd>general practice</kwd><kwd>data governance</kwd><kwd>data processing</kwd><kwd>data quality</kwd><kwd>fitness for purpose</kwd><kwd>data extraction</kwd><kwd>ETL</kwd><kwd>extraction, transformation, and loading</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Secondary use of routine health care data is progressively more common, such as the use of electronic health records (EHRs) for research and policy making [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref4">4</xref>]. EHR data are frequently used to report the incidence, prevalence, and health service use [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref10">10</xref>]. For example, during the COVID-19 pandemic, EHR data facilitated the monitoring of disease spread and health service use [<xref ref-type="bibr" rid="ref11">11</xref>]. In the Netherlands, general practice EHR data play an important role in research and policy making. 
Additionally, these valuable data can be used for quality improvement goals without imposing any additional administrative burden on health care professionals [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. From a European perspective, there have been significant developments such as the establishment of a European Health Data Space (EHDS), which should facilitate secondary use of routine health care data [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. Overall, much good is expected from the developments taking place regarding routine health care data.</p><p>At the same time, there is an ongoing debate about the fitness for the purpose of EHR data for secondary use [<xref ref-type="bibr" rid="ref1">1</xref>]. Studies have concluded EHR data to be accurate and reliable for the identification of symptoms and diseases, and to be informative of health care consumption rates, suggesting no further verification is needed prior to the secondary use of these data [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. Additionally, studies agree on the value of EHR data and the broad range of purposes that EHR data could be used for [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. Others caution against the potential introduction of various types of bias, such as selection bias [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. Since the primary design of EHR data is to record individual patient care as part of the health care process, the fitness for the purpose of this data for secondary use requires careful consideration. 
Recent research emphasizes the importance of the concept of fitness for purpose and fitness for use, respectively: data serving intended decision-making functions and the ability to get the right information, into the right hands at the right time [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>Data quality is influenced by factors such as variations in health care professionals&#x2019; recording habits caused by high administrative workloads [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref19">19</xref>], inconsistent reporting guidelines, including guidelines for language use in EHR, and variations in EHR software, which can affect morbidity estimates [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. Additionally, data governance decisions made during the extraction, transformation, and loading (ETL) process&#x2014;including data preparation and cleaning&#x2014;contribute to variations in research datasets [<xref ref-type="bibr" rid="ref1">1</xref>]. Verheij et al [<xref ref-type="bibr" rid="ref1">1</xref>] visualized the data flow from EHR systems to research datasets into distinct zones (ie, care zone, database zone, and research zone) (<xref ref-type="fig" rid="figure1">Figure 1</xref>). Each zone includes the underlying actions (ie, recording in EHR, extracting data, preparing data for research) and actors (ie, physician, database manager, researcher) that contribute to the introduction of potential biases. 
While efforts have been made to optimize EHR data for secondary purposes and to limit potential bias, such as confounding bias and selection bias, most studies have focused on the completeness of the data, in the &#x201C;research zone&#x201D; [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. Hence, the impact of different ETL methods in the &#x201C;database zone&#x201D; on data quality remains underexplored [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref4">4</xref>], leaving stakeholders unaware of potential biases in processing routines.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Steps and actors involved in the data flow between the delivery of care and applications reusing the data from Verheij et al [<xref ref-type="bibr" rid="ref1">1</xref>]. EHR: electronic health record.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e64628_fig01.png"/></fig><p>Therefore, the aim of this study is to investigate the influence of ETL steps involved in the secondary use of general practice EHR data on research outcomes, between 2 different EHR-research databases encompassing data from the same 8 general practices. Using a selection of indicators representative of epidemiological and health services use studies, the difference between the datasets from the 2 respective databases will be investigated. Due to the many differentiating steps within these ETL processes, we expect the indicator outcomes to be different, and to demonstrate the extent of the differences, potentially depending on the extent of the data processing. 
The results of this study will provide valuable insight into the extent of differentiation between EHR databases and possibly contribute to the awareness of routine health data users of the effects of ETL processes.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Design</title><p>This observational study was conducted in the context of the FAIR work packages of 3 larger COVID-19-related collaborations: COVID-GP [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref24">24</xref>], Long COVID Mixed Methods [<xref ref-type="bibr" rid="ref25">25</xref>], and GRIP-3 [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. For these projects, pseudonymized EHR data from general practices were stored and combined on the data platform of Statistics Netherlands. For this study, data from 8 general practices were used, covering the period from January 1 to December 31, 2019. The use of these data allowed for the unique opportunity to evaluate the impact of specific choices made during the different data extraction and processing methods (ie, database zone steps) by comparing the 2 datasets in a three-step approach: (1) patient demographics, (2) epidemiology of concordant patients, and (3) health service use in 3 diagnosis groups.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>Ethical approval for this study was waived by the medical ethics committee of the University Medical Centre Groningen (reference number: 2020/309). The use of EHR data is permitted under certain conditions by Dutch law, both for the data from the general practice registration network (Academic General Practitioner Development Network; AHON) and Nivel Primary Care Database (Nivel-PCD). 
According to this legislation, neither obtaining informed consent from patients nor approval by a medical ethics committee is obligatory for these types of observational studies that contain no directly identifiable patient data (art. 24 GDPR Implementation Act jo art. 9.2 sub j GDPR). For Nivel-PCD, the project has been approved by the relevant governance bodies of Nivel-PCD under number NZR-00320.087. As mentioned, the EHR data used in this study were pseudonymized before analysis.</p></sec><sec id="s2-3"><title>Databases</title><p>The datasets used for this study originate from EHR data provided by general practices for 2 research databases: the AHON registry [<xref ref-type="bibr" rid="ref28">28</xref>] and the Nivel-PCD [<xref ref-type="bibr" rid="ref29">29</xref>]. The aim of these registries is to provide insights into epidemiology and health care provided in general practices in the Netherlands. The datasets contain pseudonymized EHR data from 56 general practices participating in the AHON registry, located in the North of the Netherlands (approved under number 2020/309) and 363 general practices participating in the Nivel-PCD, located across all regions in the Netherlands (approved under number NZR-00320.087). Eight of these 56 general practices contributed to both databases. The structured data from these shared practices was used to compare research outcomes for the distinctly processed research datasets of the AHON registry and Nivel-PCD.</p></sec><sec id="s2-4"><title>Data Extraction and Processing Pipelines</title><p>The AHON registry receives EHR data from a third party that extracts and processes the data. Nivel-PCD receives extracted EHR from the EHR system provider of the general practitioner (GP), after pseudonymization by a third party. This process follows extraction specifications formulated by the AHON registry and Nivel-PCD, respectively, a distinct step in the ETL process for the 2 research databases. 
The translation into the registry and preparation of the dataset for the researchers involves distinct steps and choices as well. <xref ref-type="other" rid="box1">Textbox 1</xref> provides an explanation of the differences in processing of each variable between the AHON registry and Nivel-PCD, and the zones in which processing takes place.</p><boxed-text id="box1"><title> Variables used for analyses (for the Academic General Practitioner Development Network [AHON] registry and Nivel Primary Care Database [Nivel-PCD]): definitions and processing zones.</title><p><bold>Care Zone Variables:</bold></p><p><bold>International Classification of Primary Care (ICPC) Code</bold></p><list list-type="bullet"><list-item><p>Diagnosis codes are based on the ICPC-1 coding system. [<xref ref-type="bibr" rid="ref30">30</xref>] These codes have been recorded in the electronic health record (EHR) by the general practitioner (GP), and are linked to prescriptions, contacts or actions performed by the GP.</p></list-item><list-item><p><bold>AHON registry:</bold> ICPC codes as recorded by the GP.</p></list-item><list-item><p><bold>Nivel-PCD:</bold> ICPC codes as recorded by the GP.</p></list-item></list><p><bold>Anatomical Therapeutic Chemical (ATC) Code</bold></p><list list-type="bullet"><list-item><p>Prescription codes are based on the ATC classification system for the recording of medication. [<xref ref-type="bibr" rid="ref31">31</xref>] ATC-codes are recorded in the EHR, by either the GP or via feedback from the pharmacist.</p></list-item><list-item><p><bold>AHON registry:</bold> ATC-codes as recorded by the GP. No feedback from different health care providers</p></list-item><list-item><p><bold>Nivel-PCD:</bold> ATC-codes recorded in the GP system. Originating from recordings of GPs or from feedback provided by pharmacies. 
Distinguishing between the 2 sources is not feasible.</p></list-item></list><p><bold>Episodes</bold></p><list list-type="bullet"><list-item><p>Depending on the EHR system, the end of an episode is automatically recorded in the EHR or this is done manually by the GP. Symptoms or comorbidities can be linked to an episode with the corresponding ICPC code.</p></list-item><list-item><p><bold>AHON registry:</bold> Based on the episodes of care as recorded in the EHR.</p></list-item><list-item><p><bold>Nivel-PCD:</bold> Not applicable: See &#x201C;Episode-construct&#x201D; under Database Zone Variables within this textbox.</p></list-item></list><p><bold>Database Zone Variables:</bold></p><p><bold>Registration quarter</bold></p><list list-type="bullet"><list-item><p>The yearly quarter a patient was enrolled at the general practice, based on capitation fees that are recorded on a quarterly basis for each patient that is enrolled in the practice during that quarter. The datasets in this study contain information on enrolled patients in 2019, and as such the maximum number of registration quarters per patient is 4.</p></list-item><list-item><p><bold>AHON registry:</bold> Registration duration is based on the date of enrollment of a patient. Only fully registered quarters are included. When a patient enrolls halfway through the quarter, the registration will start from the next quarter.</p></list-item><list-item><p><bold>Nivel-PCD:</bold> Registration duration is determined by capitation fee records. Only patients enrolled for a full quarter are included. Mid-quarter registrations start the following quarter. Missing quarters between first and last registrations are imputed.</p></list-item></list><p><bold>Pseudonymized patient identification number</bold></p><list list-type="bullet"><list-item><p>A unique number assigned to each patient in a dataset stored on the data platform of Statistics Netherlands. 
This pseudonymized identifier allows patient information to be linked to other datasets available on the data platform.</p></list-item><list-item><p><bold>AHON registry</bold>: For patients uploaded to the data platform of Statistics Netherlands, this is based on 3 numbers of the postal code (PC3), year of birth, and sex of the patient.</p></list-item><list-item><p><bold>Nivel-PCD</bold>: Based on the social security number of the patient, both for patients uploaded to the data platform of Statistics Netherlands and for the usual datasets.</p></list-item></list><p><bold>Prescriptions</bold></p><list list-type="bullet"><list-item><p>In this study defined as prescribed medications, based on the record of a unique ATC-code on a unique date.</p></list-item><list-item><p><bold>AHON registry</bold>: Contains all medications prescribed by the GP, including repeat prescriptions.</p></list-item><list-item><p><bold>Nivel-PCD</bold>: Contains all medications recorded in the EHR, whether prescribed by the GP (including repeat prescriptions) or by a different health care provider; see &#x201C;ATC-code&#x201D; under care zone within this textbox. Prescriptions are deduplicated in case of a double record within 8 days, for example in case of a record by the GP and a record by the pharmacist.</p></list-item></list><p><bold>Insurance claims codes</bold></p><list list-type="bullet"><list-item><p>Based on the insurance claims database (Vektis) classification system designed to record and invoice all activities of the GP [<xref ref-type="bibr" rid="ref32">32</xref>]. It can be further divided into activities during practice consultations, home visits, other contacts, and capitation fee records. Activities can be linked to an ICPC code on the same day by the data processor, and used to classify and invoice actions such as interventions performed in the general practice, thus providing insight into the invoiced activities of a GP. 
Insurance claims codes are recorded in the care zone, and processed in the database zone.</p></list-item><list-item><p><bold>AHON registry:</bold> All insurance claims codes as recorded by the GP are included. Including the code for recording the capitation fees of a patient each quarter.</p></list-item><list-item><p><bold>Nivel-PCD:</bold> All insurance claims codes as recorded by the GP are received, however, when a dataset has been requested by the researcher, the codes are filtered by the data processor to include a selection of codes relevant to the study (in agreement with the researchers), on a database zone level.</p></list-item></list><p><bold>Contacts</bold></p><list list-type="bullet"><list-item><p>Defined as moments of contact between a GP and a patient. Based on unique dates on which an insurance claim code was recorded by the GP, that is, the maximum number of contacts per patient per day is 1. Insurance claims codes are a classification system designed to record all actions of the GP, and can be further divided into practice consultations, home visits and other contacts. The recording of ICPC codes and ATC-codes can be linked to contacts by the data processor, based on date.</p></list-item><list-item><p><bold>AHON registry</bold>: Contains all insurance claims codes present in the general practice EHR system. Including interventions carried out by GP practice support.</p></list-item><list-item><p><bold>Nivel-PCD:</bold> Contains a selection of insurance claims codes relevant to the research.</p></list-item></list><p><bold>Episode-construct</bold></p><list list-type="bullet"><list-item><p>An adaptation of the recorded episodes of care as recorded in the EHR by the GP. EHR data of the current year and the 2 prior years are used to construct the episode. A diagnosis is labeled an episode of illness from the date of diagnosis to the last encounter plus half of the duration of the contact-free interval. 
[<xref ref-type="bibr" rid="ref33">33</xref>]</p></list-item><list-item><p><bold>AHON registry:</bold> See &#x201C;episodes&#x201D; under care zone within this textbox.</p></list-item><list-item><p><bold>Nivel-PCD:</bold> Episodes are based on the episode construct. The Nivel-PCD thus actively enters an &#x201C;end-date&#x201D; for certain episodes based on this construct, independently of the recording of the GP. Within this construct, when a symptom, such as coughing, is recorded under an episode such as asthma, the symptom will be overruled by the episode.</p></list-item></list><p><bold>Research Zone Variables:</bold></p><p><bold>Patient year</bold></p><list list-type="bullet"><list-item><p>Duration of the year that a patient was registered at a general practice. Calculated based on registration quarters (1-4), thus the minimum number of patient years per patient is 0.25, and the maximum number of patient years per patient is 1. Patient years are calculated by the researcher.</p></list-item><list-item><p><bold>AHON registry:</bold> Operationalization is conducted following the description using registration quarters and pseudonymized patient identification numbers.</p></list-item><list-item><p><bold>Nivel-PCD:</bold> Operationalization is conducted following the description using registration quarters and pseudonymized patient identification numbers.</p></list-item></list><p><bold>Prevalence rate</bold></p><list list-type="bullet"><list-item><p>The total number of patients with a disease existing in the population, in this study per 1000 patient years. The corresponding formula is the number of patients with the record of the ICPC diagnosis code/number of patient years of the population * 1000. The ICPC codes used for this calculation are contact-ICPC codes. 
The maximum number of disease cases per ICPC code is 1 per patient.</p></list-item><list-item><p><bold>AHON registry:</bold> Operationalization is conducted following the description using ICPC codes and patient years.</p></list-item><list-item><p><bold>Nivel-PCD:</bold> Operationalization is conducted following the description using ICPC codes and patient years.</p></list-item></list><p><bold>Regular consultations and visits</bold></p><list list-type="bullet"><list-item><p>A subselection of contacts as described under &#x201C;Database zone.&#x201D; The subselection is based on insurance claims codes representing regular consultations and visits [<xref ref-type="bibr" rid="ref32">32</xref>], and is identical for AHON registry and Nivel-PCD, namely: 12001 &#x2013; regular consultation, &#x003E; 20 minutes; 12002 &#x2013; regular visit, &#x003C; 20 minutes; 12003 &#x2013; regular visit, &#x003E; 20 minutes; 12010 &#x2013; regular consultation, &#x003C; 5 minutes; and 12011 &#x2013; regular consultation, &#x003E; 5 minutes &#x003C; 20 minutes.</p></list-item><list-item><p><bold>AHON registry:</bold> All subselected insurance claims codes are present in the AHON registry dataset, and no further data processing takes place on these codes.</p></list-item><list-item><p><bold>Nivel-PCD:</bold> All subselected insurance claims codes are present in the Nivel-PCD dataset, and no further data processing takes place on these codes.</p></list-item></list></boxed-text></sec><sec id="s2-5"><title>Population</title><p>The population consisted of all individuals enrolled as patients for at least one quarter in one of the 8 shared general practices. A subgroup of concordant patients was used for part of the analyses. The concordant patient group comprised patients present in both databases (41,857/49,907, 83.9%), based on their identical identification numbers as assigned by Statistics Netherlands. 
This identification number was derived from a pseudonym of the social security number for patients in the Nivel-PCD and, for patients in the AHON registry, a pseudonym of a combination of 3 digits of the postal code (PC3), year of birth, and sex. The concordant patient group was the focus of the second step of our 3-step approach&#x2014;the epidemiology of concordant patients&#x2014;enabling an accurate assessment of data similarity between patients in the 2 research datasets. Nonconcordant patients were excluded to avoid automatically skewed results. The total group of patients&#x2014;all those present in the databases&#x2014;was used for the first and third steps, namely the analyses of patient demographics and health service use in 3 diagnosis groups. This approach minimized selection bias, as research on health service use conducted with EHR data usually does not allow for the filtering of such patients. <xref ref-type="fig" rid="figure2">Figure 2</xref> provides a complete flowchart of the population inclusion.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Flowchart of patients included in the study population. AHON: Academic General Practitioner Development Network (Academische Huisartsen Ontwikkel Netwerk); EHR: electronic health record; Nivel-PCD: Nivel Primary Care Database.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e64628_fig02.png"/></fig></sec><sec id="s2-6"><title>Indicators of Similarity</title><p>The datasets include data on patient demographics, including age, sex, postal code, and registration quarter, as well as information on the number, type, and reason of contacts, including consultations, visits, and other interventions. Diagnoses or symptoms, along with the number of and indications for prescriptions at the patient level, were included as well. 
Information on diagnoses included the International Classification of Primary Care (ICPC)-1 codes and information on prescriptions included the Anatomical Therapeutic Chemical codes [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. For consultations, visits, and other interventions, the insurance claims codes were included, which were used to record and invoice all activities of the GP, along with corresponding dates. These data are provided in different modules as follows: patient table, contacts table, interventions table, and prescriptions table. Each variable within these tables represents a record by the GP.</p><p>Indicators were operationalized differently in each database based on requirements set by the principal investigators of the main project as preparation for the researchers (<xref ref-type="fig" rid="figure1">Figure 1</xref>). Definitions and operationalization of variables or records, as well as the &#x201C;zone&#x201D; in which the records were processed, are detailed in <xref ref-type="other" rid="box1">Textbox 1</xref>. When records were processed differently, an explanation of the processing step is included for the relevant database, that is, AHON registry or Nivel-PCD. Variables from the &#x201C;research zone&#x201D; (<xref ref-type="other" rid="box1">Textbox 1</xref>), such as patient years, prevalence rate, and regular consultations and visits, were operationalized in identical ways, as explained in <xref ref-type="other" rid="box1">Textbox 1</xref>. <xref ref-type="fig" rid="figure3">Figure 3</xref> provides a schematic overview of the connections between the variables and the processing zones. Variables are placed in blue fields representing the actor responsible for producing or processing them, with arrows indicating the relationship between the variables. 
The further down a variable is placed in the overview, the more that variable has been processed.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Schematic overview of the variables, their relationships, and their origin. ATC: Anatomical Therapeutic Chemical; EHR: electronic health record; ICPC: International Classification of Primary Care; Nivel-PCD: Nivel Primary Care Database.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e64628_fig03.png"/></fig></sec><sec id="s2-7"><title>Analyses</title><sec id="s2-7-1"><title>Overview</title><p>Demographic analyses, analyses on the epidemiology of concordant patients, and analyses on the health service use of patients in 3 diagnosis groups were performed on the Statistics Netherlands remote access platform, where both datasets were uploaded and stored.</p></sec><sec id="s2-7-2"><title>Step 1: Demographics</title><p>Demographic analyses were performed on all patients present in the databases. The total number of unique patients based on the identification number, the total number of patient years, the mean number of patients per practice, the number of patients by age category (0&#x2010;4, 5&#x2010;17, 18&#x2010;64, and 65+ y), and sex were calculated. These demographic analyses provided insights into potential differences between the 2 populations and provided a relevant perspective on the outcomes of step 3, the health service use analyses in 3 diagnosis groups. Due to the differences in pseudonymization methods and registration quarters processing methods, a minor difference in the demographics of the AHON study population and the Nivel-PCD study population was anticipated. 
For a definition of patient years and all other variables used for the analyses, see <xref ref-type="other" rid="box1">Textbox 1</xref>.</p></sec><sec id="s2-7-3"><title>Step 2: Epidemiology of Concordant Patients</title><p>To identify differences present on a patient level, we analyzed the similarity of data present between the 2 datasets for the concordant patient group. The use of the concordant patient group ensured that detection bias was minimized and differences in outcome measures present were not overestimated due to population discrepancies. We compared the mean and SD of similarity indicators present in both datasets at the patient level: the number of contacts, number of regular consultations and visits, number of prescriptions, and number of episodes. For these analyses, we merged the available indicators based on the mutual identification number of the patient and compared the mean and SD for each indicator by performing paired samples <italic>t</italic> tests and calculating the corresponding 99% CIs, that is, the confidence level was set to 99% due to the large number of data. A larger difference was expected in the number of contacts and episodes, due to the more extensive data processing that took place for these variables (<xref ref-type="other" rid="box1">Textbox 1</xref>), especially due to the presence of the episode construct for Nivel-PCD episodes. For the number of regular consultations and visits, we expected no significant differences as the data extraction and processing on these insurance claims codes were largely identical for the 2 databases. 
Analyzing the health care consumption of patients based on a subselection of relevant insurance claims codes, as opposed to a nonspecific selection, that is, all insurance claims codes, is more representative of research conducted with EHR data, as this is a more commonly used method among researchers.</p></sec><sec id="s2-7-4"><title>Step 3: Analyses of the Health Service Use in 3 Diagnosis Groups</title><p>Subsequently, a set of indicators was selected to provide a way to compare the research outcomes of both datasets for different diagnoses. The purpose of these analyses is to observe the possible effects of the different data ETL pipelines to which the 2 datasets have been subjected. Hence, the analyses were performed for all patients present in the datasets.</p></sec></sec><sec id="s2-8"><title>Selection of Indicators</title><p>The indicators were selected by a research team with expertise in research conducted with routine health care data, and data processors of the 2 databases. With this set of indicators, we aimed to be representative of the research that is typically conducted with these datasets [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]. When selecting the set of indicators, the selection process focused on including the diagnosis of a chronic or long-term condition, an acute condition, and a symptom with a high prevalence within Dutch general practices and a high disease burden. Additionally, we were careful to ensure that the selection of indicators used all available data provided by the databases, to ensure that any relevant potential differences present in the data were detected in the outcomes. We included patients with diabetes mellitus (DM), urinary tract infection (UTI), and cough diagnosis. 
For each of these diagnosis groups, we calculated the total number of patients by including those with a relevant ICPC code recorded in the EHR: T90 for DM, U71 for UTI, and R05 for cough. The prevalence rate per 1000 patient years was calculated by dividing the number of patients with one of these ICPC diagnosis codes recorded by the number of patient years of the population, then multiplying by 1000. Additionally, we calculated the number of regular consultations and visits these patients received per 1000 patient years, and the number of prescriptions per 1000 patient years. This was done by selecting a group of Anatomical Therapeutic Chemical codes for each diagnosis, as indicated by the pharmacotherapeutic compass, a Dutch web-based reference book that provides independent pharmaceutical information for medical professionals: A10A and A10B for DM; G03C, J01C, J01D, J01E, J01G, J01M, and J01X for UTI; and R05C, R05D, R05X, and R06A for cough [<xref ref-type="bibr" rid="ref35">35</xref>].</p><p>For each diagnosis group, the differences in the indicators were compared using the SD, meaning the proportion of patients based on the AHON registry was compared with the proportion of people based on Nivel-PCD:</p><p><inline-formula><mml:math id="ieqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mrow><mml:mi>S</mml:mi><mml:mi>D</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mover><mml:mi>p</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mtext>AHON</mml:mtext></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>p</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mtext>NPCD</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:msqrt><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mover><mml:mi>p</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mtext>AHON</mml:mtext></mml:mrow></mml:msub><mml:mo 
stretchy="false">(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>p</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mtext>AHON</mml:mtext></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>p</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mtext>NPCD</mml:mtext></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>p</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mtext>NPCD</mml:mtext></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mn>2</mml:mn></mml:mfrac></mml:msqrt></mml:mfrac></mml:mrow></mml:mstyle></mml:math></inline-formula></p><p>Where AHON=AHON registry and NPCD=Nivel-PCD. According to Austin [<xref ref-type="bibr" rid="ref36">36</xref>], absolute values of the SD of 0.2, 0.5, and 0.8 correspond to small, medium, and large differences, respectively [<xref ref-type="bibr" rid="ref37">37</xref>]. Remaining cautious to some extent, and in order to disclose small differences, differences with absolute values of the SD&#x003E;0.2 were considered to be significant.</p><p>All analyses were performed using R (version 4.2.3; R Foundation for Statistical Computing) and RStudio (version 2022.02.1+461 &#x201C;Prairie Trillium&#x201D;; R Foundation for Statistical Computing). For definitions of patient years, prevalence rate, and prescriptions, see <xref ref-type="other" rid="box1">Textbox 1</xref>.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Population and Demographics Characteristics</title><p>This study included all patients who were registered for at least one quarter in 2019 at 1 of 8 general practices represented in both the AHON registry and the Nivel-PCD. 
This resulted in a total of 49,907 patients, of which 47,517 patients were present in the AHON registry database and 44,247 patients were present in the Nivel-PCD. A subgroup of 41,857 patients present in both datasets, referred to as &#x201C;concordant patients,&#x201D; was identified. The total number of patients in the general practices prior to data extraction from the EHR system was unavailable.</p><p><xref ref-type="table" rid="table1">Table 1</xref> provides an overview of the demographic characteristics of all patients combined. The patient demographics, age category, and sex were similar in both datasets. There was a difference in the total number of unique patients (n=47,517 vs n=44,247), the number of patient years (n=46,400 vs n=43,100), and the mean number of patients per practice (n=5940 vs n=5531). The latter difference was largely influenced by one outlier practice. We found no statistically significant differences in the demographic characteristics between the datasets, with all <italic>P</italic> values &#x003E;0.5.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Demographic characteristics of the study population (all patients, N=49,907).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Characteristic</td><td align="left" valign="bottom">AHON<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> registry (n=47,517)</td><td align="left" valign="bottom">Nivel-PCD<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> (n=44,247)</td></tr></thead><tbody><tr><td align="left" valign="top">Patient years</td><td align="left" valign="top">46,400</td><td align="left" valign="top">43,100</td></tr><tr><td align="left" valign="top">Patient years per patient</td><td align="left" valign="top">0.977</td><td align="left" valign="top">0.974</td></tr><tr><td align="left" valign="top">Patients per practice, mean (SD)</td><td align="left" valign="top">5940 
(1313.3)<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="top">5531 (1803.4)<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td></tr><tr><td align="left" valign="top">Age category (years), n (%)</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>0&#x2010;4</td><td align="left" valign="top">2239 (4.7)</td><td align="left" valign="top">2093 (4.7)</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>5&#x2010;17</td><td align="left" valign="top">7382 (15.5)</td><td align="left" valign="top">6917 (15.6)</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>18&#x2010;64</td><td align="left" valign="top">27,507 (57.9)</td><td align="left" valign="top">25,684 (58.0)</td></tr><tr><td align="char" char="." 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>65+</td><td align="left" valign="top">10,389 (21.9)</td><td align="left" valign="top">9553 (21.6)</td></tr><tr><td align="left" valign="top">Sex, n (%)</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Male</td><td align="left" valign="top">23,471 (49.4)</td><td align="left" valign="top">21,968 (49.6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Female</td><td align="left" valign="top">24,046 (50.6)</td><td align="left" valign="top">22,279 (50.4)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>AHON: Academic General Practitioner Development Network.</p></fn><fn id="table1fn2"><p><sup>b</sup>Nivel-PCD: Nivel Primary Care Database.</p></fn><fn id="table1fn3"><p><sup>c</sup>Difference in mean number of patients per practice largely due to one outlier.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Similarity in Epidemiology of Concordant Patients</title><p>The number of contacts, regular consultations and visits, prescriptions, and episodes per patient were analyzed for the concordant patients in each dataset. By comparing these outcomes, statistically significant differences were obtained between the AHON registry and Nivel-PCD. All differences were significant. In the AHON registry, the mean number of contacts recorded per patient was 8.58 (SD 10.10), while in the Nivel-PCD this average was 7.40 (SD 9.02). The average number of regular consultations and visits per patient was 4.33 (SD 5.67) in the AHON registry and 4.30 (SD 5.65) in the Nivel-PCD. 
The number of prescriptions was higher in the AHON registry, with an average of 6.75 (SD 11.30) per patient, compared with 5.90 (SD 9.45) in Nivel-PCD (<italic>P</italic>&#x003C;.001). The number of episodes was significantly lower for the patients in the AHON registry: 1.61 (SD 1.73) episodes per patient in 2019, while patients in the Nivel-PCD had a mean number of 3.74 (SD 3.67) episodes in 2019 (<italic>P</italic>&#x003C;.001). <xref ref-type="table" rid="table2">Table 2</xref> provides detailed outcomes for all indicators of similarity in the concordant patient group.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Epidemiology of the concordant study population.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Outcome</td><td align="left" valign="bottom">AHON<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> registry (n=41,857), mean (SD)</td><td align="left" valign="bottom">Nivel-PCD<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup> (n=41,857), mean (SD)</td><td align="left" valign="bottom" colspan="2">Statistical differences between AHON registry and Nivel-PCD</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"><italic>P</italic> value</td><td align="left" valign="bottom">95% CI<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Number of contacts per patient</td><td align="left" valign="top">8.58 (10.10)</td><td align="left" valign="top">7.40 (9.02)</td><td align="left" valign="top">&#x003C;.001</td><td align="char" char="." valign="top">1.17 to 0.01</td></tr><tr><td align="left" valign="top">Number of regular consultations and visits per patient</td><td align="left" valign="top">4.33 (5.67)</td><td align="left" valign="top">4.30 (5.65)</td><td align="left" valign="top">.46</td><td align="char" char="." 
valign="top">&#x2013;0.07 to 0.13</td></tr><tr><td align="left" valign="top">Number of prescriptions per patient</td><td align="left" valign="top">6.75 (11.30)</td><td align="left" valign="top">5.90 (9.45)</td><td align="left" valign="top">&#x003C;.001</td><td align="char" char="." valign="top">0.83 to 0.87</td></tr><tr><td align="left" valign="top">Number of episodes per patient</td><td align="left" valign="top">1.61 (1.73)</td><td align="left" valign="top">3.74 (3.67)</td><td align="left" valign="top">&#x003C;.001</td><td align="char" char="." valign="top">2.11 to 2.15</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>AHON: Academic General Practitioner Development Network.</p></fn><fn id="table2fn2"><p><sup>b</sup>Nivel-PCD: Nivel Primary Care Database.</p></fn><fn id="table2fn3"><p><sup>c</sup>95% CI of the mean difference.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Analyses on Health Service Use in 3 Diagnosis Groups</title><p>In step 3, we analyzed the prevalence rate, number of prescriptions, and the number of regular consultations and visits per 1000 patient years for 3 different diagnosis groups: DM, UTI, and cough. The differences varied greatly, but none were deemed significant. There were no significant differences in the prevalence rates for the DM (SD &#x2212;0.01), UTI (SD &#x2212;0.10), and cough (SD &#x2212;0.19) diagnosis groups, as shown in <xref ref-type="table" rid="table3">Table 3</xref>. The SD between the number of prescriptions per 1000 patient years was 0.12 for the DM diagnosis group, 0.01 for the UTI diagnosis group, and &#x2212;0.001 for the cough diagnosis group. The number of regular consultations and visits did not significantly differ across the 3 diagnosis groups either, with SDs of &#x2212;0.05 for the DM group, &#x2212;0.19 for the UTI group, and &#x2212;0.18 for the cough group. 
All SDs are presented in <xref ref-type="table" rid="table3">Table 3</xref>.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Indicator outcomes for 3 diagnoses.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Indicator</td><td align="left" valign="bottom">AHON<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> registry (n=47,517) (patient years=46,400)</td><td align="left" valign="bottom">Nivel-PCD<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup> (n=44,247) (patient years=43,100)</td><td align="left" valign="bottom">SD between AHON registry and Nivel-PCD</td></tr></thead><tbody><tr><td align="left" valign="top">Diabetes mellitus (T90) diagnosis</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Number of patients with ICPC<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup> T90 record</td><td align="left" valign="top">2979</td><td align="left" valign="top">2787</td><td align="left" valign="top">N/A<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Patient years</td><td align="left" valign="top">2936</td><td align="left" valign="top">2728</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Patient years per patient</td><td align="left" valign="top">0.986</td><td align="left" valign="top">0.979</td><td align="left" valign="top"/></tr><tr><td 
align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Prevalence rate</td><td align="left" valign="top">64.2</td><td align="left" valign="top">64.7</td><td align="left" valign="top">&#x2212;0.01</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Number of prescriptions per 1000 patient years</td><td align="left" valign="top">447.0</td><td align="left" valign="top">377.4</td><td align="left" valign="top">0.12</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Number of regular consultations and visits per 1000 patient years</td><td align="left" valign="top">498.1</td><td align="left" valign="top">529.6</td><td align="left" valign="top">&#x2212;0.05</td></tr><tr><td align="left" valign="top">Urinary tract infection (U71) diagnosis</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Number of patients with ICPC U71 record</td><td align="left" valign="top">2960</td><td align="left" valign="top">3005</td><td align="left" valign="top">N/A</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Patient years</td><td align="left" valign="top">2927.25</td><td align="left" valign="top">2961.25</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Patient years per patient</td><td align="left" valign="top">0.977</td><td 
align="left" valign="top">0.977</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Prevalence rate</td><td align="left" valign="top">63.8</td><td align="left" valign="top">69.7</td><td align="left" valign="top">&#x2212;0.10</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Number of prescriptions per 1000 patient years</td><td align="left" valign="top">138.1</td><td align="left" valign="top">135.0</td><td align="left" valign="top">0.01</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Number of regular consultations and visits per 1000 patient years</td><td align="left" valign="top">722.1</td><td align="left" valign="top">823.3</td><td align="left" valign="top">&#x2212;0.19</td></tr><tr><td align="left" valign="top">Cough (R05) diagnosis</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Number of patients with ICPC R05 record (patient years, patient years per patient)</td><td align="left" valign="top">2404</td><td align="left" valign="top">2718</td><td align="left" valign="top">N/A</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Patient years</td><td align="left" valign="top">2356.5</td><td align="left" valign="top">2663.25</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Patient years per patient</td><td align="left" valign="top">0.980</td><td align="left" valign="top">0.980</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Prevalence rate</td><td align="left" valign="top">51.8</td><td align="left" valign="top">63.1</td><td align="left" valign="top">&#x2212;0.19</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Number of prescriptions per 1000 patient years</td><td align="left" valign="top">50.8</td><td align="left" valign="top">51.0</td><td align="left" valign="top">&#x2212;0.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Number of regular consultations and visits per 1000 patient years</td><td align="left" valign="top">426.4</td><td align="left" valign="top">536.7</td><td align="left" valign="top">&#x2212;0.18</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>AHON: Academic General Practitioner Development Network.</p></fn><fn id="table3fn2"><p><sup>b</sup>Nivel-PCD: Nivel Primary Care Database.</p></fn><fn id="table3fn3"><p><sup>c</sup>ICPC: International Classification of Primary Care.</p></fn><fn id="table3fn4"><p><sup>d</sup>N/A: not applicable.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study investigated the influence of data extraction and processing on research outcomes derived from routine health care data by comparing indicators of similarity based on EHR data from 8 general practices processed through 2 different ETL methods. 
Despite the identical origin of the data, differences in data extraction and processing pipelines, as well as the choices made by several actors (eg, data processors and researchers), during different stages of these processing methods by each database, resulted in different indicator outcomes. Our results show more substantial differences when data have been more extensively processed, and no significant differences when processing was minimized.</p><p>The value of EHR data is increasingly recognized, along with the acknowledgment of the need for reliable and valid data. However, our findings emphasize the need for transparency regarding the data governance and the motives behind the ETL steps, as well as adequate metadata to document these decisions [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. For example, these choices often originate from the inclination to improve the validity of the outcomes. Moreover, interoperability challenges extend beyond uniform EHR systems or standardized coding or ontologies [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref40">40</xref>].</p><p>The results support the expectation that ETL choices can significantly affect research outcomes. This underscores the relevance of adopting transparency in the approach to obtaining, processing, analyzing, and, above all, interpreting the data when aiming for appropriate quality. Additionally, it shows the complexity of data processing and the coordination of certain definitions of variables between data processors and researchers. Furthermore, researchers conducting future studies with EHR data should be mindful of data processing choices made, and data processors should share their knowledge about these choices. 
Additionally, users of EHR data, such as researchers and policy makers, should invest in their knowledge of metadata, as transparency is becoming increasingly critical in the context of developments such as the EHDS.</p></sec><sec id="s4-2"><title>Principal Results</title><p>First, we compared the demographic characteristics of all patients in the study population and noted differences in the number of unique patients, patient years, and the mean number of patients per practice. This may be explained by the different pseudonymization methods, as the AHON registry uses the PC3 postal code, year of birth, and sex of the patient, whereas the Nivel-PCD uses the pseudonym of the social security number of the patient. After pseudonymization, patient data were uploaded, stored, and combined on the data platform of Statistics Netherlands. Previous research shows that similar procedures result in a loss of coverage of linked patients [<xref ref-type="bibr" rid="ref41">41</xref>]. The difference in patient years may be caused by the data processing of the registration quarters in each database, as each database&#x2019;s patient years are based on different dates of registration quarters, and imputation of registration quarters takes place for Nivel-PCD only.</p><p>Second, we compared the epidemiology of the concordant patient group and found significant differences between the AHON registry and Nivel-PCD in the average number of contacts, prescriptions, episodes, and regular consultations and visits per patient. This implies that differences occur on the level of the datasets as a whole, as seen from the demographic analysis results, as well as for the exact concordant patients. We conclude these differences occur due to different data ETL methods, and additionally, due to the impact of choices on the analyses. 
For example, a specific selection of insurance claims codes for regular consultations and visits, although significantly different, results in a similar mean number per patient, implying that thorough coordination of variables may mitigate the effects of different data processing methods. Nivel-PCD and AHON registries apply different exclusion steps for insurance claims codes in preparation for a research dataset, resulting in a large difference in the total number of insurance claims codes and the type of codes included. The subselection of these claims codes for regular consultations and visits was specifically coordinated, resulting in no significant difference in the number of regular consultations and visits between the 2 databases. When identical definitions are used for regular consultations and visits, and minimal processing steps have taken place on the variables used, no significant differences are observed between the 2 databases. This highlights the importance of clear variables and ETL specification when analyzing data [<xref ref-type="bibr" rid="ref39">39</xref>]. Interpretation differences can occur when insurance claims codes are analyzed in general. As data are increasingly being shared, for example between countries, researchers with limited knowledge of certain health care systems or data processing methods can unintentionally present biased results. Meta-information on data extraction and processing choices, as well as research methods, could be a solution. The difference in number of episodes may be attributed to the Nivel-PCD episode construct. The Nivel-PCD episode construct algorithm takes all episodes recorded by GPs into account, as well as contacts with the GP and prescriptions as recorded in the EHR. Additionally, this construct includes episodes that were started at the end of the previous year and continue into the next year [<xref ref-type="bibr" rid="ref33">33</xref>]. 
For AHON registry diagnoses based on the episode ICPC, these episodes are not included. The episode construct, hence, possibly increases the number of episodes compared with the number of episodes based on general practice records. This methodological distinction aligns with the results of this study.</p><p>Lastly, we investigated the effects of the different data extraction and processing methods on a selection of indicator outcomes, comparable to real-world research conducted with these datasets, by analyzing the health service use in 3 diagnosis groups. For these indicators, we found no SDs that were deemed significant. The prevalence rates for the UTI diagnosis group and the cough diagnosis group showed greater differences, while the prevalence rates for the DM diagnosis group did not. This may be attributed to the processing that takes place for ICPC codes within the contacts table, which includes the ICPC codes of patients who have visited the GP for this specific diagnosis. Patients with chronic illnesses may visit the GP more frequently for their diagnosis compared with patients with an acute illness, potentially diminishing the effect of ETL differences, as the maximum number of disease cases for the prevalence rate was one per patient. Additionally, differences may be attributed to the differences in the processing of patient years and the pseudonymization process.</p><p>The variance in the SD among prevalence rates and the additional indicators, along with the discrepancy of significant differences for the concordant patient group and no significant differences for the 3 diagnosis groups, shows that the severity of the differences may range from irrelevant to significant. This may be conditional on the purpose of the use of these data. In this study, the results of the indicators are dependent on the dataset that is used to answer the research questions, as well as the method to analyze these data. 
The interpretation of these outcomes is relevant because research outcomes are often used for purposes such as policy making and feedback information to health care professionals, and the approach of interpreting research outcomes when handling imperfect data is thus consequential. Third parties using EHR data for secondary uses should therefore not dismiss the value EHR data have to offer, such as the breadth of research outcomes available, as demonstrated in this study, but rather focus on improving the manner in which these data are handled. The criticality of the interpretation of research outcomes may be different for research outcomes based on trends over time. In other words, when the data extraction and processing methods remain identical for several datasets over the years, and data processors and researchers focus on data robustness, outcome measures on trends over time might remain reliable. This should be explored in future research.</p></sec><sec id="s4-3"><title>Comparison With Prior Work</title><p>The observed differences in outcomes of the indicators across all 3 steps highlight the necessity of transparency and joint decision-making with the knowledge of researchers on the dataset that is being used. Instructions on the fitness for purpose and the data quality can be included in the documentation, and clear communication between data processors and researchers is crucial for how researchers and policy makers interpret the results of their study. The outcomes of this study suggest that frameworks to improve fitness for purpose could be a necessary tool in analyzing and interpreting the data. 
Previous research has resulted in a data quality assessment framework to improve the quality of the datasets that are used for secondary purposes such as research, but it does not elaborate on the effects that processing steps can have on research outcomes [<xref ref-type="bibr" rid="ref42">42</xref>].</p><p>Differences in outcomes that occur due to data processing emphasize the need to make joint decisions regarding ETL pipelines, as this may increase interoperability, for example, between research databases. To achieve this, documentation regarding this process is essential, and the need for detailed meta-information is crucial in this type of research. Interoperability also requires collaboration within and between data processors and researchers [<xref ref-type="bibr" rid="ref43">43</xref>]. This cooperation will lead to better interpretations of the research conducted with these types of data, and previous research concludes there are benefits to be gained from research on optimal common standards [<xref ref-type="bibr" rid="ref44">44</xref>]. Similar common approaches have been recommended to improve data quality and have resulted in a harmonized data quality assessment framework [<xref ref-type="bibr" rid="ref45">45</xref>]. To stimulate interoperability and increase data quality [<xref ref-type="bibr" rid="ref46">46</xref>], frameworks such as these should become common practice before analyzing EHR data.</p></sec><sec id="s4-4"><title>Limitations</title><p>A limitation of this study is the choice of diagnoses (ie, DM, UTI, and cough) with high prevalence rates, possibly making the indicator outcomes less applicable to smaller diagnostic groups, for example, of rare conditions [<xref ref-type="bibr" rid="ref47">47</xref>]. These diagnoses were selected to ensure sufficient patient numbers per diagnosis as the number of concordant general practices in the databases was small. 
An additional limitation is the lack of detailed information on the data ETL steps that were taken for each database. This was due to the fact that this study started with the end products, that is, the research datasets, as opposed to the unprocessed data coming directly from the EHR systems. Despite the limitations, these data were prepared for research independently of this study, which decreases bias in the preparation methods of the extraction and data processing for these research datasets. Moreover, this study appears to be the first to compare data processing methods with concordant general practices and hence contributes to gaining insight into the influence of these methods on research outcomes based on EHR data.</p></sec><sec id="s4-5"><title>Conclusions</title><p>In conclusion, routine health care data such as EHR data from general practices offer a broad spectrum of applications, and the secondary use of these data is ever-increasing. Moreover, the results show the impact of data processing steps and analysis choices on the selected indicators and the necessity of transparency between the knowledge of data processors regarding these choices and the knowledge of researchers of this type of metadata. Researchers and policy makers should be cautious with the secondary use of EHR data, especially with regard to the interpretation of research outcomes. Future research should focus on this transparency and the benefits of using a data quality framework intended to minimize the effects of data processing steps, and on gaining more insight into the individual influence of different processing steps on different research outcomes. 
This could stimulate a common approach among data processors and researchers and thus increase interoperability, which is all the more important with regard to developments such as the EHDS and the ever-increasing secondary use of routinely recorded health data.</p></sec></sec></body><back><ack><p>This research has been conducted using the Academic General Practitioner Development Network (AHON) registry and Nivel Primary Care Database (Nivel-PCD). The AHON registry and Nivel-PCD are ongoing longitudinal databases that aim to give insight into medical care provided in general practices in the North of the Netherlands and the Netherlands, respectively. Starting in 1998 and 1970, respectively, pseudonymized patient records have been added to the AHON registry and Nivel-PCD multiple times a year from a growing number of participating general practices. Patients are informed by their general practices via folders, posters, and websites of their participation in the AHON registry and Nivel-PCD. Individual patients can fill out an opt-out form in order to not have their data recorded in the AHON registry or Nivel-PCD. This study was performed as part of the following projects: Long COVID MM project (10430302110004), the &#x201C;General Practice Research Infrastructure Pandemic Preparedness Program&#x201D; (GRIP3) (10430112110001), and the study: &#x201C;Changes in the Use and Organization of Care in General Practices and Out-of-hours Services: Lessons Learned from the COVID-19 Pandemic&#x201D; (10430022010006), which were financed by the Netherlands Organization for Health Research and Development (ZonMW). The funder played no role in the study design, data collection, data analysis and interpretation, or writing of this manuscript. We used the generative artificial intelligence tool ChatGPT by OpenAI [<xref ref-type="bibr" rid="ref48">48</xref>] in the preparation of this manuscript as a grammar and text editing tool. 
All sections of the manuscript were further reviewed and revised by the study group.</p></ack><notes><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are not publicly available due to the regulations for sensitive health data in Article 9 of the General Data Protection Regulation (GDPR) of the European Union but are available from the corresponding author upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>MHJvE, IGA, IB, RAV, and LLP were responsible for the conceptualization of the study. Data curation was performed by YMW and FG. MHJvE and RT conducted the formal analysis. Funding was acquired by IB, RAV, and LLP. The investigation was carried out by MHJvE. The methodology was developed by MHJvE, RT, IGA, IB, and RAV. MHJvE also managed the project. Resources were provided by YMW, FG, LLP, and RAV. Supervision was carried out by IB and RAV. The original draft was written by MHJvE, IB, and RAV. All authors&#x2014;MHJvE, RT, YMW, IGA, FG, LLP, IB, and RAV&#x2014;contributed to the review and editing of the manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AHON</term><def><p>Academic General Practitioner Development Network (Academische Huisartsen Ontwikkel Netwerk)</p></def></def-item><def-item><term id="abb2">DM</term><def><p>diabetes mellitus</p></def></def-item><def-item><term id="abb3">EHDS</term><def><p>European Health Data Space</p></def></def-item><def-item><term id="abb4">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb5">ETL</term><def><p>extraction, transformation, and loading</p></def></def-item><def-item><term id="abb6">GP</term><def><p>general practitioner</p></def></def-item><def-item><term id="abb7">ICPC</term><def><p>International Classification of Primary Care</p></def></def-item><def-item><term id="abb8">Nivel-PCD</term><def><p>Nivel Primary 
Care Database</p></def></def-item><def-item><term id="abb9">UTI</term><def><p>urinary tract infection</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Verheij</surname><given-names>RA</given-names> </name><name name-style="western"><surname>Curcin</surname><given-names>V</given-names> </name><name name-style="western"><surname>Delaney</surname><given-names>BC</given-names> </name><name name-style="western"><surname>McGilchrist</surname><given-names>MM</given-names> </name></person-group><article-title>Possible sources of bias in primary care electronic health record data use and reuse</article-title><source>J Med Internet Res</source><year>2018</year><month>05</month><day>29</day><volume>20</volume><issue>5</issue><fpage>e185</fpage><pub-id pub-id-type="doi">10.2196/jmir.9134</pub-id><pub-id pub-id-type="medline">29844010</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ramerman</surname><given-names>L</given-names> </name><name name-style="western"><surname>Rijpkema</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bos</surname><given-names>N</given-names> </name><name name-style="western"><surname>Flinterman</surname><given-names>LE</given-names> </name><name name-style="western"><surname>Verheij</surname><given-names>RA</given-names> </name></person-group><article-title>The use of out-of-hours primary care during the first year of the COVID-19 pandemic</article-title><source>BMC Health Serv Res</source><year>2022</year><month>05</month><day>21</day><volume>22</volume><issue>1</issue><fpage>679</fpage><pub-id pub-id-type="doi">10.1186/s12913-022-08096-x</pub-id><pub-id pub-id-type="medline">35597939</pub-id></nlm-citation></ref><ref 
id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rijpkema</surname><given-names>C</given-names> </name><name name-style="western"><surname>Ramerman</surname><given-names>L</given-names> </name><name name-style="western"><surname>Homburg</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Care by general practitioners for patients with asthma or COPD during the COVID-19 pandemic</article-title><source>NPJ Prim Care Respir Med</source><year>2023</year><month>04</month><day>8</day><volume>33</volume><issue>1</issue><fpage>15</fpage><pub-id pub-id-type="doi">10.1038/s41533-023-00340-z</pub-id><pub-id pub-id-type="medline">37031214</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mc Grath-Lone</surname><given-names>L</given-names> </name><name name-style="western"><surname>Jay</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Blackburn</surname><given-names>R</given-names> </name><etal/></person-group><article-title>What makes administrative data &#x201C;research-ready&#x201D;? 
A systematic review and thematic analysis of published literature</article-title><source>Int J Popul Data Sci</source><year>2022</year><volume>7</volume><issue>1</issue><fpage>1718</fpage><pub-id pub-id-type="doi">10.23889/ijpds.v6i1.1718</pub-id><pub-id pub-id-type="medline">35520099</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Arslan</surname><given-names>IG</given-names> </name><name name-style="western"><surname>Damen</surname><given-names>J</given-names> </name><name name-style="western"><surname>de Wilde</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Incidence and prevalence of knee osteoarthritis using codified and narrative data from electronic health records: a population-based study</article-title><source>Arthritis Care Res (Hoboken)</source><year>2022</year><month>06</month><volume>74</volume><issue>6</issue><fpage>937</fpage><lpage>944</lpage><pub-id pub-id-type="doi">10.1002/acr.24861</pub-id><pub-id pub-id-type="medline">35040591</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Viol&#x00E1;n</surname><given-names>C</given-names> </name><name name-style="western"><surname>Foguet-Boreu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Hermosilla-P&#x00E9;rez</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Comparison of the information provided by electronic health records data and a population health survey to estimate prevalence of selected health conditions and multimorbidity</article-title><source>BMC Public Health</source><year>2013</year><month>03</month><day>21</day><volume>13</volume><fpage>1</fpage><lpage>10</lpage><pub-id pub-id-type="doi">10.1186/1471-2458-13-251</pub-id><pub-id 
pub-id-type="medline">23517342</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="web"><article-title>How we collect data</article-title><source>Institute for Health Metrics and Evaluation</source><year>2024</year><access-date>2024-06-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.healthdata.org/data-tools-practices/data-collection">https://www.healthdata.org/data-tools-practices/data-collection</ext-link></comment></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="web"><article-title>Health service data</article-title><source>World Health Organization</source><year>2024</year><access-date>2024-06-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.who.int/data/data-collection-tools/health-service-data">https://www.who.int/data/data-collection-tools/health-service-data</ext-link></comment></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Poos</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gommer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Nielen</surname><given-names>M</given-names> </name></person-group><article-title>Gebruik huisartsenregistraties voor schattingen morbiditeit [use of GP registrations for morbidity estimates]</article-title><source>VZinfo website</source><year>2024</year><access-date>2024-06-20</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.vzinfo.nl/bronnen-methoden-en-achtergronden/huisartsen-registraties-schattingen-morbiditeit#:~:text=Op%20VZinfo.nl%20zijn%20schattingen,prevalentie)%2C%20absoluut%20of%20relatief">https://www.vzinfo.nl/bronnen-methoden-en-achtergronden/huisartsen-registraties-schattingen-morbiditeit#:~:text=Op%20VZinfo.nl%20zijn%20schattingen,prevalentie)%2C%20absoluut%20of%20relatief</ext-link></comment></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vanhommerig</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Verheij</surname><given-names>RA</given-names> </name><name name-style="western"><surname>Hek</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Data resource profile: nivel primary care database (Nivel-PCD), the Netherlands</article-title><source>Int J Epidemiol</source><year>2025</year><month>02</month><day>16</day><volume>54</volume><issue>2</issue><fpage>dyaf017</fpage><pub-id pub-id-type="doi">10.1093/ije/dyaf017</pub-id><pub-id pub-id-type="medline">40083190</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Madhavan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bastarache</surname><given-names>L</given-names> </name><name name-style="western"><surname>Brown</surname><given-names>JS</given-names> </name><etal/></person-group><article-title>Use of electronic health records to support a public health response to the COVID-19 pandemic in the United States: a perspective from 15 academic medical centers</article-title><source>J Am Med Inform Assoc</source><year>2021</year><month>02</month><day>15</day><volume>28</volume><issue>2</issue><fpage>393</fpage><lpage>401</lpage><pub-id 
pub-id-type="doi">10.1093/jamia/ocaa287</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Horrocks</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wilkinson</surname><given-names>T</given-names> </name><name name-style="western"><surname>Schnier</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Accuracy of routinely-collected healthcare data for identifying motor neurone disease cases: a systematic review</article-title><source>PLoS ONE</source><year>2017</year><volume>12</volume><issue>2</issue><fpage>e0172639</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0172639</pub-id><pub-id pub-id-type="medline">28245254</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dash</surname><given-names>S</given-names> </name><name name-style="western"><surname>Shakyawar</surname><given-names>SK</given-names> </name><name name-style="western"><surname>Sharma</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kaushik</surname><given-names>S</given-names> </name></person-group><article-title>Big data in healthcare: management, analysis and future prospects</article-title><source>J Big Data</source><year>2019</year><month>12</month><volume>6</volume><issue>1</issue><fpage>54</fpage><pub-id pub-id-type="doi">10.1186/s40537-019-0217-0</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="web"><article-title>What is the European Health Data Space (EHDS)?</article-title><source>The European Health Data Space (EHDS)</source><year>2024</year><access-date>2024-06-20</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.european-health-data-space.com/">https://www.european-health-data-space.com/</ext-link></comment></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Marcus</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Martens</surname><given-names>B</given-names> </name><name name-style="western"><surname>Carugati</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bucher</surname><given-names>A</given-names> </name><name name-style="western"><surname>Godlovitch</surname><given-names>I</given-names> </name></person-group><article-title>The European Health Data Space</article-title><year>2022</year><publisher-name>IPOL | Policy Department for Economic, Scientific and Quality of Life Policies, European Parliament Policy Department studies, 2022</publisher-name><pub-id pub-id-type="doi">10.2139/ssrn.4300393</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Harper</surname><given-names>C</given-names> </name><name name-style="western"><surname>Mafham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Herrington</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Comparison of the accuracy and completeness of records of serious vascular events in routinely collected data vs clinical trial-adjudicated direct follow-up data in the UK: secondary analysis of the ASCEND randomized clinical trial</article-title><source>JAMA Netw Open</source><year>2021</year><month>12</month><day>1</day><volume>4</volume><issue>12</issue><fpage>e2139748</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2021.39748</pub-id><pub-id pub-id-type="medline">34962561</pub-id></nlm-citation></ref><ref 
id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ta</surname><given-names>CN</given-names> </name><name name-style="western"><surname>Dumontier</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hripcsak</surname><given-names>G</given-names> </name><name name-style="western"><surname>Tatonetti</surname><given-names>NP</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>C</given-names> </name></person-group><article-title>Columbia open health data, clinical concept prevalence and co-occurrence from electronic health records</article-title><source>Sci Data</source><year>2018</year><month>11</month><day>27</day><volume>5</volume><issue>1</issue><fpage>180273</fpage><pub-id pub-id-type="doi">10.1038/sdata.2018.273</pub-id><pub-id pub-id-type="medline">30480666</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Barbazza</surname><given-names>E</given-names> </name><name name-style="western"><surname>Klazinga</surname><given-names>NS</given-names> </name><name name-style="western"><surname>Kringos</surname><given-names>DS</given-names> </name></person-group><article-title>Exploring the actionability of healthcare performance indicators for quality of care: a qualitative analysis of the literature, expert opinion and user experience</article-title><source>BMJ Qual Saf</source><year>2021</year><month>12</month><volume>30</volume><issue>12</issue><fpage>1010</fpage><lpage>1020</lpage><pub-id pub-id-type="doi">10.1136/bmjqs-2020-011247</pub-id><pub-id pub-id-type="medline">33963072</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>M&#x00E5;nsson</surname><given-names>J</given-names> </name><name name-style="western"><surname>Nilsson</surname><given-names>G</given-names> </name><name name-style="western"><surname>Bj&#x00F6;rkelund</surname><given-names>C</given-names> </name><name name-style="western"><surname>Strender</surname><given-names>LE</given-names> </name></person-group><article-title>Collection and retrieval of structured clinical data from electronic patient records in general practice. a first-phase study to create a health care database for research and quality assessment</article-title><source>Scand J Prim Health Care</source><year>2004</year><month>03</month><volume>22</volume><issue>1</issue><fpage>6</fpage><lpage>10</lpage><pub-id pub-id-type="doi">10.1080/02813430310003660</pub-id><pub-id pub-id-type="medline">15119513</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Arslan</surname><given-names>IG</given-names> </name><name name-style="western"><surname>Damen</surname><given-names>J</given-names> </name><name name-style="western"><surname>de Wilde</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Estimating incidence and prevalence of hip osteoarthritis using electronic health records: a population-based cohort study</article-title><source>Osteoarthr Cartil</source><year>2022</year><month>06</month><volume>30</volume><issue>6</issue><fpage>843</fpage><lpage>851</lpage><pub-id pub-id-type="doi">10.1016/j.joca.2022.03.001</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van den Dungen</surname><given-names>C</given-names> </name><name name-style="western"><surname>Hoeymans</surname><given-names>N</given-names> </name><name name-style="western"><surname>van den 
Akker</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Do practice characteristics explain differences in morbidity estimates between electronic health record based general practice registration networks?</article-title><source>BMC Fam Pract</source><year>2014</year><month>10</month><day>30</day><volume>15</volume><issue>1</issue><fpage>176</fpage><pub-id pub-id-type="doi">10.1186/s12875-014-0176-7</pub-id><pub-id pub-id-type="medline">25358247</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Haneuse</surname><given-names>S</given-names> </name><name name-style="western"><surname>Daniels</surname><given-names>M</given-names> </name></person-group><article-title>A general framework for considering selection bias in EHR-based studies: what data are observed and why?</article-title><source>EGEMS (Wash DC)</source><year>2016</year><volume>4</volume><issue>1</issue><fpage>1203</fpage><pub-id pub-id-type="doi">10.13063/2327-9214.1203</pub-id><pub-id pub-id-type="medline">27668265</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Grobbee</surname><given-names>DE</given-names> </name><name name-style="western"><surname>Hoes</surname><given-names>AW</given-names> </name><name name-style="western"><surname>Verheij</surname><given-names>TJM</given-names> </name><name name-style="western"><surname>Schrijvers</surname><given-names>AJP</given-names> </name><name name-style="western"><surname>van Ameijden</surname><given-names>EJC</given-names> </name><name name-style="western"><surname>Numans</surname><given-names>ME</given-names> </name></person-group><article-title>The Utrecht health project: optimization of routine healthcare data for research</article-title><source>Eur J 
Epidemiol</source><year>2005</year><volume>20</volume><issue>3</issue><fpage>285</fpage><lpage>287</lpage><pub-id pub-id-type="doi">10.1007/s10654-004-5689-2</pub-id><pub-id pub-id-type="medline">15921047</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Homburg</surname><given-names>M</given-names> </name><name name-style="western"><surname>Berger</surname><given-names>M</given-names> </name><name name-style="western"><surname>Berends</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Dutch GP healthcare consumption in COVID-19 heterogeneous regions: an interregional time-series approach in 2020-2021</article-title><source>BJGP Open</source><year>2024</year><month>07</month><volume>8</volume><issue>2</issue><fpage>BJGPO.2023.0121</fpage><pub-id pub-id-type="doi">10.3399/BJGPO.2023.0121</pub-id><pub-id pub-id-type="medline">38128964</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bos</surname><given-names>I</given-names> </name><name name-style="western"><surname>Bosman</surname><given-names>L</given-names> </name><name name-style="western"><surname>van den Hoek</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Comparison of observational methods to identify and characterize post-COVID syndrome in the Netherlands using electronic health records and questionnaires</article-title><source>PLoS ONE</source><year>2025</year><volume>20</volume><issue>1</issue><fpage>e0318272</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0318272</pub-id><pub-id pub-id-type="medline">39879159</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name 
name-style="western"><surname>Blanker</surname><given-names>MH</given-names> </name></person-group><article-title>General practice research infrastructure pandemic preparedness program (GRIP3). COVID-19 2021</article-title><year>2023</year><publisher-name>ZonMW project</publisher-name></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Homburg</surname><given-names>M</given-names> </name><name name-style="western"><surname>Meijer</surname><given-names>E</given-names> </name><name name-style="western"><surname>Berends</surname><given-names>M</given-names> </name><etal/></person-group><article-title>A natural language processing model for COVID-19 detection based on Dutch general practice electronic health records by using bidirectional encoder representations from transformers: development and validation study</article-title><source>J Med Internet Res</source><year>2023</year><month>10</month><day>4</day><volume>25</volume><fpage>e49944</fpage><pub-id pub-id-type="doi">10.2196/49944</pub-id><pub-id pub-id-type="medline">37792444</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Twickler</surname><given-names>R</given-names> </name><name name-style="western"><surname>Berger</surname><given-names>MY</given-names> </name><name name-style="western"><surname>Groenhof</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Data resource profile: registry of electronic health records of general practices in the north of the Netherlands (AHON)</article-title><source>Int J Epidemiol</source><year>2024</year><month>02</month><day>14</day><volume>53</volume><issue>2</issue><fpage>dyae021</fpage><pub-id pub-id-type="doi">10.1093/ije/dyae021</pub-id><pub-id 
pub-id-type="medline">38389286</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="web"><article-title>Nivel primary care database</article-title><source>Nivel</source><year>2024</year><access-date>2024-06-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.nivel.nl/en/our-databases-and-panels/nivel-primary-care-database">https://www.nivel.nl/en/our-databases-and-panels/nivel-primary-care-database</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Soler</surname><given-names>JK</given-names> </name><name name-style="western"><surname>Okkes</surname><given-names>I</given-names> </name><name name-style="western"><surname>Wood</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lamberts</surname><given-names>H</given-names> </name></person-group><article-title>The coming of age of ICPC: celebrating the 21st birthday of the international classification of primary care</article-title><source>Fam Pract</source><year>2008</year><month>08</month><volume>25</volume><issue>4</issue><fpage>312</fpage><lpage>317</lpage><pub-id pub-id-type="doi">10.1093/fampra/cmn028</pub-id><pub-id pub-id-type="medline">18562335</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Nahler</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Anatomical therapeutic chemical classification system (ATC)</article-title><source>Dictionary of Pharmaceutical Medicine</source><year>2009</year><publisher-name>Springer</publisher-name><fpage>8</fpage><pub-id pub-id-type="doi">10.1007/978-3-211-89836-9_64</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Westerdijk</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zuurbier</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ludwig</surname><given-names>M</given-names> </name><name name-style="western"><surname>Prins</surname><given-names>S</given-names> </name></person-group><article-title>Defining care products to finance health care in the Netherlands</article-title><source>Eur J Health Econ</source><year>2012</year><month>04</month><volume>13</volume><issue>2</issue><fpage>203</fpage><lpage>221</lpage><pub-id pub-id-type="doi">10.1007/s10198-011-0302-6</pub-id><pub-id pub-id-type="medline">21350859</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nielen</surname><given-names>MMJ</given-names> </name><name name-style="western"><surname>Spronk</surname><given-names>I</given-names> </name><name name-style="western"><surname>Davids</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Estimating morbidity rates based on routine electronic health records in primary care: observational study</article-title><source>JMIR Med Inform</source><year>2019</year><month>07</month><day>26</day><volume>7</volume><issue>3</issue><fpage>e11929</fpage><pub-id pub-id-type="doi">10.2196/11929</pub-id><pub-id pub-id-type="medline">31350839</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hek</surname><given-names>K</given-names> </name><name name-style="western"><surname>Ramerman</surname><given-names>L</given-names> </name><name name-style="western"><surname>Weesie</surname><given-names>YM</given-names> </name><etal/></person-group><article-title>Antibiotic prescribing in Dutch 
daytime and out-of-hours general practice during the COVID-19 pandemic: a retrospective database study</article-title><source>Antibiotics (Basel)</source><year>2022</year><month>02</month><day>25</day><volume>11</volume><issue>3</issue><fpage>309</fpage><pub-id pub-id-type="doi">10.3390/antibiotics11030309</pub-id><pub-id pub-id-type="medline">35326772</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="web"><article-title>Farmacotherapeutisch kompas [pharmacotherapeutic compass]</article-title><source>Zorginstituut Nederland</source><year>2024</year><access-date>2024-06-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.farmacotherapeutischkompas.nl/">https://www.farmacotherapeutischkompas.nl/</ext-link></comment></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Austin</surname><given-names>PC</given-names> </name></person-group><article-title>Using the standardized difference to compare the prevalence of a binary variable between two groups in observational research</article-title><source>Commun Stat Simul Comput</source><year>2009</year><month>05</month><day>14</day><volume>38</volume><issue>6</issue><fpage>1228</fpage><lpage>1234</lpage><pub-id pub-id-type="doi">10.1080/03610910902859574</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Overbeek</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Swart</surname><given-names>KMA</given-names> </name><name name-style="western"><surname>Houben</surname><given-names>E</given-names> </name><name name-style="western"><surname>Penning-van Beest</surname><given-names>FJA</given-names> </name><name name-style="western"><surname>Herings</surname><given-names>RMC</given-names> 
</name></person-group><article-title>Completeness and representativeness of the PHARMO general practitioner (GP) data: a comparison with national statistics</article-title><source>Clin Epidemiol</source><year>2023</year><volume>15</volume><fpage>1</fpage><lpage>11</lpage><pub-id pub-id-type="doi">10.2147/CLEP.S389598</pub-id><pub-id pub-id-type="medline">36636730</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wilkinson</surname><given-names>MD</given-names> </name><name name-style="western"><surname>Dumontier</surname><given-names>M</given-names> </name><name name-style="western"><surname>Aalbersberg</surname><given-names>IJJ</given-names> </name><etal/></person-group><article-title>The FAIR guiding principles for scientific data management and stewardship</article-title><source>Sci Data</source><year>2016</year><month>03</month><day>15</day><volume>3</volume><issue>1</issue><fpage>160018</fpage><pub-id pub-id-type="doi">10.1038/sdata.2016.18</pub-id><pub-id pub-id-type="medline">26978244</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jacobsen</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kaliyaperumal</surname><given-names>R</given-names> </name><name name-style="western"><surname>da Silva Santos</surname><given-names>LOB</given-names> </name><etal/></person-group><article-title>A generic workflow for the data FAIRification process</article-title><source>Data Intell</source><year>2020</year><month>01</month><volume>2</volume><issue>1-2</issue><fpage>56</fpage><lpage>65</lpage><pub-id pub-id-type="doi">10.1162/dint_a_00028</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Chang</surname><given-names>E</given-names> </name><name name-style="western"><surname>Mostafa</surname><given-names>J</given-names> </name></person-group><article-title>The use of SNOMED CT, 2013-2020: a literature review</article-title><source>J Am Med Inform Assoc</source><year>2021</year><month>08</month><day>13</day><volume>28</volume><issue>9</issue><fpage>2017</fpage><lpage>2026</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocab084</pub-id><pub-id pub-id-type="medline">34151978</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Heins</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>de Ligt</surname><given-names>KM</given-names> </name><name name-style="western"><surname>Verloop</surname><given-names>J</given-names> </name><name name-style="western"><surname>Siesling</surname><given-names>S</given-names> </name><name name-style="western"><surname>Korevaar</surname><given-names>JC</given-names> </name></person-group><article-title>Opportunities and obstacles in linking large health care registries: the primary secondary cancer care registry - breast cancer</article-title><source>BMC Med Res Methodol</source><year>2022</year><month>04</month><day>27</day><volume>22</volume><issue>1</issue><fpage>124</fpage><pub-id pub-id-type="doi">10.1186/s12874-022-01601-0</pub-id><pub-id pub-id-type="medline">35477392</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liaw</surname><given-names>ST</given-names> </name><name name-style="western"><surname>Taggart</surname><given-names>J</given-names> </name><name name-style="western"><surname>Dennis</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Yeo</surname><given-names>A</given-names> </name></person-group><article-title>Data quality and fitness for purpose of routinely collected data&#x2014;a general practice case study from an electronic practice-based research network (ePBRN)</article-title><source>AMIA Annu Symp Proc</source><year>2011</year><volume>2011</volume><fpage>785</fpage><lpage>794</lpage><pub-id pub-id-type="medline">22195136</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Neiva</surname><given-names>FW</given-names> </name><name name-style="western"><surname>David</surname><given-names>JMN</given-names> </name><name name-style="western"><surname>Braga</surname><given-names>R</given-names> </name><name name-style="western"><surname>Campos</surname><given-names>F</given-names> </name></person-group><article-title>Towards pragmatic interoperability to support collaboration: a systematic review and mapping of the literature</article-title><source>Inf Softw Technol</source><year>2016</year><month>04</month><volume>72</volume><fpage>137</fpage><lpage>150</lpage><pub-id pub-id-type="doi">10.1016/j.infsof.2015.12.013</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gini</surname><given-names>R</given-names> </name><name name-style="western"><surname>Schuemie</surname><given-names>M</given-names> </name><name name-style="western"><surname>Brown</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Data extraction and management in networks of observational health care databases for scientific research: a comparison of EU-ADR, OMOP, Mini-Sentinel and MATRICE strategies</article-title><source>EGEMS (Wash 
DC)</source><year>2016</year><volume>4</volume><issue>1</issue><fpage>1189</fpage><pub-id pub-id-type="doi">10.13063/2327-9214.1189</pub-id><pub-id pub-id-type="medline">27014709</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kahn</surname><given-names>MG</given-names> </name><name name-style="western"><surname>Callahan</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Barnard</surname><given-names>J</given-names> </name><etal/></person-group><article-title>A harmonized data quality assessment terminology and framework for the secondary use of electronic health record data</article-title><source>EGEMS (Wash DC)</source><year>2016</year><volume>4</volume><issue>1</issue><fpage>1244</fpage><pub-id pub-id-type="doi">10.13063/2327-9214.1244</pub-id><pub-id pub-id-type="medline">27713905</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Blacketer</surname><given-names>C</given-names> </name><name name-style="western"><surname>Defalco</surname><given-names>FJ</given-names> </name><name name-style="western"><surname>Ryan</surname><given-names>PB</given-names> </name><name name-style="western"><surname>Rijnbeek</surname><given-names>PR</given-names> </name></person-group><article-title>Increasing trust in real-world evidence through evaluation of observational data quality</article-title><source>J Am Med Inform Assoc</source><year>2021</year><month>09</month><day>18</day><volume>28</volume><issue>10</issue><fpage>2251</fpage><lpage>2257</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocab132</pub-id><pub-id pub-id-type="medline">34313749</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Dros</surname><given-names>JT</given-names> </name><name name-style="western"><surname>Bos</surname><given-names>I</given-names> </name><name name-style="western"><surname>Bennis</surname><given-names>FC</given-names> </name><etal/></person-group><article-title>Detection of primary Sj&#x00F6;gren&#x2019;s syndrome in primary care: developing a classification model with the use of routine healthcare data and machine learning</article-title><source>BMC Prim Care</source><year>2022</year><month>08</month><day>9</day><volume>23</volume><issue>1</issue><fpage>199</fpage><pub-id pub-id-type="doi">10.1186/s12875-022-01804-w</pub-id><pub-id pub-id-type="medline">35945489</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="web"><article-title>Introducing ChatGPT</article-title><source>OpenAI</source><year>2022</year><access-date>2025-03-04</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/blog/chatgpt">https://openai.com/blog/chatgpt</ext-link></comment></nlm-citation></ref></ref-list></back></article>