<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e76398</article-id><article-id pub-id-type="doi">10.2196/76398</article-id><article-categories><subj-group subj-group-type="heading"><subject>Viewpoint</subject></subj-group></article-categories><title-group><article-title>Beyond Missingness: Systematizing Methods for Comprehensive Data Fitness Assessment in Clinical Research</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Razzaghi</surname><given-names>Hanieh</given-names></name><degrees>MPH, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wieand</surname><given-names>Kaleigh</given-names></name><degrees>BSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Dickinson</surname><given-names>Kimberley L</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kahn</surname><given-names>Michael G</given-names></name><degrees>MD, 
PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Roy</surname><given-names>Jason</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Blacketer</surname><given-names>Clair</given-names></name><degrees>MPH, PMP</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Christakis</surname><given-names>Dimitri A</given-names></name><degrees>MD, MPH</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Forrest</surname><given-names>Christopher B</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Greenberg</surname><given-names>Jane</given-names></name><degrees>MS, PhD</degrees><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lehmann</surname><given-names>Harold P</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Marsolo</surname><given-names>Keith A</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff8">8</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Sciolla</surname><given-names>Jennifer</given-names></name><degrees>CTRS, CCLS, MS</degrees><xref ref-type="aff" rid="aff9">9</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Weiner</surname><given-names>Mark G</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff10">10</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Weiskopf</surname><given-names>Nicole 
G</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff11">11</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Bailey</surname><given-names>L Charles</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Applied Clinical Research Center, Children's Hospital of Philadelphia</institution><addr-line>3401 Civic Center Blvd</addr-line><addr-line>Philadelphia</addr-line><addr-line>PA</addr-line><country>United States</country></aff><aff id="aff2"><institution>Analytics Resource Center, University of Colorado Anschutz Medical Campus</institution><addr-line>Aurora</addr-line><addr-line>CO</addr-line><country>United States</country></aff><aff id="aff3"><institution>Department of Biostatistics and Epidemiology, Rutgers School of Public Health</institution><addr-line>Piscataway</addr-line><addr-line>NJ</addr-line><country>United States</country></aff><aff id="aff4"><institution>Epidemiology Analytics, Janssen Research and Development</institution><addr-line>Titusville</addr-line><addr-line>NJ</addr-line><country>United States</country></aff><aff id="aff5"><institution>Department of Pediatrics, Seattle Children's Hospital</institution><addr-line>Seattle</addr-line><addr-line>WA</addr-line><country>United States</country></aff><aff id="aff6"><institution>Department of Information Science, Drexel University</institution><addr-line>Philadelphia</addr-line><addr-line>PA</addr-line><country>United States</country></aff><aff id="aff7"><institution>Department of Medicine, Johns Hopkins School of Medicine</institution><addr-line>Baltimore</addr-line><addr-line>MD</addr-line><country>United States</country></aff><aff id="aff8"><institution>Department of Population Health Sciences, Duke University School of Medicine</institution><addr-line>Durham</addr-line><addr-line>NC</addr-line><country>United States</country></aff><aff 
id="aff9"><institution>Biomedical Research Informatics Center, Nemours/Alfred I. DuPont Hospital for Children</institution><addr-line>Wilmington</addr-line><addr-line>DE</addr-line><country>United States</country></aff><aff id="aff10"><institution>Department of Population Health Sciences, Weill Cornell Medicine</institution><addr-line>New York</addr-line><addr-line>NY</addr-line><country>United States</country></aff><aff id="aff11"><institution>Department of Medical Informatics and Clinical Epidemiology, Oregon Health and Science University</institution><addr-line>Portland</addr-line><addr-line>OR</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Almeida</surname><given-names>Aitor</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Hebert</surname><given-names>Courtney</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>&#xC784;</surname><given-names>&#xBBFC;&#xC2DD;</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Hanieh Razzaghi, MPH, PhD, Applied Clinical Research Center, Children's Hospital of Philadelphia, 3401 Civic Center Blvd, Philadelphia, PA, 19104, United States, 1 814-441-9659; <email>razzaghih@chop.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>14</day><month>4</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e76398</elocation-id><history><date date-type="received"><day>23</day><month>04</month><year>2025</year></date><date date-type="rev-recd"><day>13</day><month>01</month><year>2026</year></date><date 
date-type="accepted"><day>28</day><month>01</month><year>2026</year></date></history><copyright-statement>&#x00A9; Hanieh Razzaghi, Kaleigh Wieand, Kimberley L Dickinson, Michael G Kahn, Jason Roy, Clair Blacketer, Dimitri A Christakis, Christopher B Forrest, Jane Greenberg, Harold P Lehmann, Keith A Marsolo, Jennifer Sciolla, Mark G Weiner, Nicole G Weiskopf, L Charles Bailey. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 14.4.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e76398"/><abstract><p>Secondary use of clinical data offers unprecedented opportunities to rapidly conduct large-scale research and improve patient care. However, incomplete understanding of data quality requirements for a study often causes significant delays in executing analyses and validating results. Current practice has largely followed 2 paths. 
First, multi-institutional networks have developed general data quality programs, but these are typically tied to unique network characteristics and do not address study-specific requirements well. Second, models have been proposed to formalize the requirements for data fitness analyses without extending to the methods needed to meet these requirements. More recently, tools have been developed to conduct cohort-centric screening, focusing on generally applicable structural checks such as missingness or facial implausibility. These provide a first level of information but incompletely capture the fitness requirements of an analysis. In turn, investigators conduct per-study exploratory analyses, but these efforts are typically ad hoc and partially reported, which can hinder reproducible science and delay advances in patient care. Analogously to advances over the past decade in data modeling and reproducible analytics, there is a need for a more systematic, capable approach to study-specific data quality assessment (SSDQA). We discuss such a model, which guides improved SSDQA design and implementation, including metadata for consistent annotation and reporting of data quality assessment results. The model integrates theoretical principles of data quality testing with pragmatic considerations of application to clinical data, providing a consistent approach to specifying data quality assessment checks. Additionally, it proposes to regularize check application through a standard set of options. The SSDQA model builds on current practice, providing a path toward more complete, sound, and reproducible assessments. 
These characteristics foster multidisciplinary collaboration to identify data quality issues that, in turn, inform decisions about study design and provide important context that has a bearing on adoption of results.</p></abstract><kwd-group><kwd>data quality</kwd><kwd>electronic health records</kwd><kwd>research readiness</kwd><kwd>automation</kwd><kwd>data fitness</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Network Data Quality Assessment</title><p>Secondary use of clinical data offers unprecedented opportunities to conduct large-scale research and derive insights into patient care. This is particularly true when using data from multi-institutional networks such as PEDSnet [<xref ref-type="bibr" rid="ref1">1</xref>], PCORnet [<xref ref-type="bibr" rid="ref2">2</xref>], the National COVID Cohort Collaborative [<xref ref-type="bibr" rid="ref3">3</xref>], All of Us [<xref ref-type="bibr" rid="ref4">4</xref>], or others [<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref9">9</xref>]. However, clinical data are complex, and problems related to data quality [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref16">16</xref>] have significantly hindered research across many networks. General data quality assessment (DQA) programs have been developed for identifying and resolving issues in real-world data [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref32">32</xref>], including in multi-institutional networks, but efforts have often been siloed by unique network characteristics, differences in data models, or lack of automation and reproducibility [<xref ref-type="bibr" rid="ref33">33</xref>-<xref ref-type="bibr" rid="ref35">35</xref>]. 
This tight coupling of DQA programs with existing infrastructure limits reuse in the wider community.</p><p>Data quality frameworks have traditionally focused on data model conformance, completeness, and plausibility components [<xref ref-type="bibr" rid="ref27">27</xref>] rather than targeting specific clinical content, which can miss significant threats to analytic validity. This is not simply a lack of effective implementation. Rather, it reflects that general DQA at the network level addresses separate analytic problems from study-specific DQA (SSDQA), although the 2 domains are related. For example, a network data quality metric may evaluate clinical procedures as a single group to ensure appropriate mappings and compute procedure records per patient. However, if a particular study relies on the presence of specific biopsy procedures, this can only provide a starting point. A gap in data extraction that maps these procedures to a valid but less specific code or misses these mappings entirely&#x2014;whether overall or for the study group specifically&#x2014;would not be discovered during broader network data quality checks. Further testing would be needed to assess whether the needed information could be recovered by changing extraction of procedure data or whether the presence of the biopsy must be inferred from other data. As a result, most network-focused data quality programs lack the flexibility needed in study-specific contexts. Furthermore, most current DQA research focuses primarily on harmonization of terms to <italic>describe</italic> data quality rather than developing standardized models to <italic>produce</italic> standardized assessments.</p></sec><sec id="s1-2"><title>Addressing Study-Specific Data Quality</title><p>While less widely considered, there have been several attempts to address the need for domain-specific or SSDQA. 
Efforts focused on regulatory science targeting the US Food and Drug Administration and European Medicines Agency have produced planning guides for evaluating fitness in real-world data, such as SPIFD (Structured Process to Identify Fit-for-Purpose Data) and STaRT-RWE (Structured Template for Planning and Reporting on the Implementation of Real-World Evidence Studies), which have considerably advanced the field [<xref ref-type="bibr" rid="ref36">36</xref>-<xref ref-type="bibr" rid="ref38">38</xref>]. Similar work derived from model-based software design has addressed analogous steps [<xref ref-type="bibr" rid="ref39">39</xref>]. These frameworks formalize study design and data requirements, underlining what is needed to produce reliable results for regulatory decision-making. However, providing specific methods or software to standardize and systematically report on findings was beyond the scope of these efforts. Furthermore, these and similar frameworks recognize the importance of well-delineated reporting on data quality but focus on reporting as part of a bespoke study design rather than building reusable informatics frameworks or tools.</p><p>Separately, several tools have taken pragmatic approaches to interrogating study datasets. Some address specific topics, such as missingness or &#x201C;never events&#x201D; in DQe-c [<xref ref-type="bibr" rid="ref40">40</xref>] or differences in case mix in the ENACT Data Quality Explorer [<xref ref-type="bibr" rid="ref35">35</xref>]. Another effort, dataquieR [<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref42">42</xref>], focuses on highly flexible testing of a priori constraints; the resulting tools have a wide potential range but concretely address a smaller set of data quality needs and provide limited options to probe semantic issues that may arise in the data. 
A third direction, exemplified by the Observational Health Data Sciences and Informatics CohortDiagnostics package [<xref ref-type="bibr" rid="ref43">43</xref>], has been to transfer tests that are effective at the network level for application to study cohorts to produce more specific reporting.</p><p>While each of these efforts has been a step forward, evaluations of the current state of SSDQA for clinical data have consistently identified the need for more standard, applicable, automatable approaches [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]. A recent systematic review of DQAs and tools concluded that most SSDQAs are developed on a project-by-project basis and that this approach is not practical because of time and resource constraints [<xref ref-type="bibr" rid="ref33">33</xref>]. Finally, prior efforts have focused on solving a specific problem rather than developing an underlying model to serve a wide range of data quality requirements in a systematic and reproducible way.</p></sec><sec id="s1-3"><title>Systematizing SSDQA</title><p>The challenge, then, is that SSDQA <italic>content</italic> must be specific to the semantic needs of each analysis, whereas SSDQA <italic>methods</italic> need to be standardized and reproducible. The field has advanced significantly in the past decade, but current practice does not fully attain either of these goals. There remains a need to bridge gaps between broad network DQA, high-level proposals for SSDQA needs, and current ad hoc practice of SSDQA. In this paper, we discuss how this challenge can be met, building from existing science including our own previous work [<xref ref-type="bibr" rid="ref44">44</xref>] and expert consensus to articulate a model for a systematic approach to SSDQA and reporting. 
Specifically, we focus on the critical space between formalizing the high levels of SSDQA [<xref ref-type="bibr" rid="ref36">36</xref>-<xref ref-type="bibr" rid="ref39">39</xref>] and the specifics of implementing tests [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>], aiming to articulate methods that satisfy the former&#x2019;s needs and systematize approaches to the latter. We address (1) well-founded, analysis-aware data quality testing and (2) the construction of practical, reusable data quality metrics that can be widely adopted to drive consistent and concrete data quality check development. Advancing in these areas is important to clinical informaticians as methodologists and stewards of data resources; to clinical researchers, who are reliant on large datasets to produce valid results; and to patients and clinicians, who must be informed consumers of research results.</p></sec></sec><sec id="s2"><title>An Expanded Model for Assessing Data Fitness</title><sec id="s2-1"><title>Articulating Requirements for Clinical Data Fitness</title><p>We first articulate a set of requirements for advancing the practice of SSDQA. 
We include both high-level strategies, or goals that an effective model must reflect, as well as specific design principles that motivate the process of check construction and application, which are shown in <xref ref-type="other" rid="box1">Textbox 1</xref>.</p><boxed-text id="box1"><title> Design principles for developing study-specific data quality assessment.</title><list list-type="bullet"><list-item><p>Data quality assessment should correspond to the different stages of a research project, ranging from cohort selection to assessment of minor covariates.</p></list-item></list><list list-type="bullet"><list-item><p>The principles of a framework should be standardized and reusable across multiple study-specific contexts, and software packages should streamline implementation.</p></list-item></list><list list-type="bullet"><list-item><p>Data quality tests need descriptive terms that are specific and pair methods with output.</p></list-item></list><list list-type="bullet"><list-item><p>Data quality software packages should be configurable without the need for extensive coding.</p></list-item></list><list list-type="bullet"><list-item><p>Methods used in data quality checks should be interpretable, with a range of standard options to evaluate results.</p></list-item></list><list list-type="bullet"><list-item><p>Data quality checks should be pragmatic and informative, without requiring deep knowledge of underlying theory for use.</p></list-item></list><list list-type="bullet"><list-item><p>Temporality should be an important dimension for data quality checks and not be treated as a separate check itself.</p></list-item></list></boxed-text><p>We then synthesize these considerations into a concrete model (<xref ref-type="fig" rid="figure1">Figure 1</xref>) for constructing fitness-oriented data quality checks. The model provides consistent guidance for standardizing check types that evaluate data fitness across a range of study-specific requirements. 
It delineates check development and execution through 2 interdependent processes labeled on the left-hand side of <xref ref-type="fig" rid="figure1">Figure 1</xref>: (1) SSDQA development and tools, which focuses on check development and back-end automation; and (2) user process, which highlights the interaction between the user and data quality checks. We elaborate on these processes in the 2 sections below.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Model for systematic construction of study-specific data quality assessment (SSDQA) checks showing the conceptual tracks of check development and user process and the stages of initial specification followed by application.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e76398_fig01.png"/></fig></sec><sec id="s2-2"><title>Adding Rigor to SSDQA Development</title><p>To promote better coverage of SSDQA needs as well as improved understanding of what the measures produce, the development of a data quality check type should incorporate 4 separate but integrated components. The first is the clinical goal or assessment, which describes the clinical truth that the user needs to evaluate; this aligns in use with purposive frameworks such as SPIFD [<xref ref-type="bibr" rid="ref37">37</xref>]. For example, an investigator might posit that patients in a study cohort should have confirmatory clinical data (eg, end-stage renal disease and evidence of dialysis) to establish basic face validity. While check types are broader than specific study goals, what goals a check type could address is an important consideration. The second component is check category, which expresses the evaluated aspect of data quality; here we encompass current work focused on widely adopted harmonized terms [<xref ref-type="bibr" rid="ref45">45</xref>] such as completeness, plausibility, or conformance. 
The third component is a data quality probe, which, in contrast to the clinical goal, designates a data-centric purpose for a data quality check, such as assessing eligibility criteria or mapping errors in the data. Finally, analysis level describes the application of a check across levels of aggregation, including person-level, event-level, or visit-level analysis. The integration of these 4 components forms a check type, which is discoverable through these attributes; a set of metadata terms reflecting current practice is provided in <xref ref-type="other" rid="box2">Textbox 2</xref>. Use of these and similar terms allows check types to be findable for reuse in different studies and promotes interoperability and reusability in the comparison of results across studies.</p><boxed-text id="box2"><title> Metadata terms associated with attributes of study-specific data quality assessment check types.</title><p><bold>Check category</bold></p><list list-type="bullet"><list-item><p>Concordance</p></list-item><list-item><p>Conformance</p></list-item><list-item><p>Completeness</p></list-item><list-item><p>Consistency</p></list-item><list-item><p>Information representation</p></list-item><list-item><p>Plausibility</p></list-item></list><p><bold>Data quality probe</bold></p><list list-type="bullet"><list-item><p>Misclassification</p></list-item><list-item><p>Eligibility criteria</p></list-item><list-item><p>Missing expected data</p></list-item><list-item><p>Anomalous information density</p></list-item><list-item><p>Temporal inconsistency</p></list-item><list-item><p>Anomalous value from internal distribution</p></list-item><list-item><p>Data representation errors</p></list-item><list-item><p>External benchmarking</p></list-item><list-item><p>Selection error or bias</p></list-item></list><p><bold>Clinical goal or assessment</bold></p><list list-type="bullet"><list-item><p>Clinical consistency</p></list-item><list-item><p>Clinical 
follow-up</p></list-item><list-item><p>Clinical complexity</p></list-item><list-item><p>Confirmatory clinical data</p></list-item><list-item><p>Expected clinical event representation</p></list-item><list-item><p>Targeted patient population</p></list-item><list-item><p>Utilization patterns</p></list-item><list-item><p>Valid diagnostic criteria</p></list-item></list><p><bold>Analysis level</bold></p><list list-type="bullet"><list-item><p>Person level</p></list-item><list-item><p>Visit level</p></list-item><list-item><p>Event level</p></list-item></list></boxed-text><p>Once a check type has been specified, application to the data could be mediated by a common code base instantiating the desired analyses, spanning the set of user configurations (see below) required to execute a specific data quality check of that type. This approach, which we term &#x201C;data quality modules,&#x201D; facilitates consistent application of the check type through a set of base functions incorporating user choices via parameters without requiring reimplementation or code editing and the consequent risk of divergent computation across use cases [<xref ref-type="bibr" rid="ref46">46</xref>].</p></sec><sec id="s2-3"><title>Standardizing Application of DQA Checks</title><p>Application of study-specific data quality testing must by definition be adapted to each study&#x2019;s requirements. Data quality checks are deployed to inform decision-making about study design and method choices, assess the validity of the dataset, and identify potential biases or stratifications present in the data. Data quality requirements may vary based on the stage of the study or the background of the user. For example, data quality assessment for a clinical researcher planning cohort selection will require a different set of evaluation measures from those for the validation of primary outcomes, and testing at the design stage will differ from that conducted by a statistician investigating an anomalous result during analysis. 
This is denoted as &#x201C;study-specific stage&#x201D; in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><p>In parallel, &#x201C;study-specific context&#x201D; compels users to formally define the data constructs (eg, cohort, variables, and concept sets) around which to identify potential gaps in data quality or constraints in study design that require testing. This step drives the creation of the &#x201C;study-specific&#x201D; requirements at the core of SSDQA application. On the basis of these criteria, users go on to select the appropriate check types to be applied to the desired data elements.</p><p>Three final decision points determine how a particular check is applied and interpreted. First, applications may focus on single- or multi-unit analysis. Most often, units are participating institutions, and multiunit output facilitates comparisons among institutions, whereas single-unit output permits more in-depth analysis within a particular institution. In other circumstances, it may be more correct to combine all institutions as a single unit of analysis or stratify by other characteristics, such as clinical specialty. Second, a particular data quality analysis may focus on manual exploration or be used to formally detect outliers. The former produces descriptive visualizations, which are particularly useful for assessing patterns relying on topical expertise. In contrast, the latter computes quantitative metrics to identify potential outliers, which is useful when assessing complex patterns or for automation. Third, users may wish to analyze their data longitudinally to understand changes over time or cross-sectionally covering a specified period to examine average effect.</p></sec><sec id="s2-4"><title>Augmenting the Scope of SSDQA</title><p>This model for SSDQA allows us to consider current practice centered on analytic requirements rather than software output and identify areas for expansion. 
We begin with a set of 18 data quality check types (<xref ref-type="table" rid="table1">Table 1</xref>) that cover foundational requirements for SSDQA; while not exhaustive, these check types span a large fraction of use cases encountered in clinical studies. Although many check types can be applied at multiple phases of a study, we group them here based on likely associations. For example, during the &#x201C;cohort identification&#x201D; phase of a study, research teams might deploy the &#x201C;attrition step&#x201D; or &#x201C;sensitivity to selection criteria&#x201D; check types. The primary difference between &#x201C;cohort fitness&#x201D; and &#x201C;dataset fitness&#x201D; is that the former evaluates the fitness of a specific cohort against study requirements, whereas the latter tests for the logical cohesion of a dataset. Therefore, anomalies found in the former require investigating study-specific criteria, whereas those found in the latter identify broader logical inconsistencies in a dataset. &#x201C;Data conformance&#x201D; check types evaluate a dataset for conformance to a required set of standards, such as ensuring that units for a particular laboratory test conform to the standard or accepted reporting for that test. &#x201C;Variable testing&#x201D; focuses on study variables, that is, a set of operational clinical definitions represented in an analysis. Nearly all check types can be reasonably configured for single-site vs multi-site analysis, exploratory output vs anomaly detection, and longitudinal vs cross-sectional analysis. 
As a result, a single check type may yield 8 different types of outputs.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Catalog of check types derived from the study-specific data quality assessment model.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Check type</td><td align="left" valign="bottom">Check description</td><td align="left" valign="bottom">Metadata terms</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="3">Cohort identification</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attrition step</td><td align="left" valign="top">Assesses heterogeneity in cohort selection through attrition steps</td><td align="left" valign="top">&#x201C;Plausibility,&#x201D; &#x201C;eligibility criteria,&#x201D; &#x201C;missing expected data,&#x201D; &#x201C;selection error or bias,&#x201D; &#x201C;targeted patient population,&#x201D; and &#x201C;person level&#x201D;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sensitivity to selection criteria</td><td align="left" valign="top">Evaluates the impact of alternate cohort definitions by comparing demographics, follow-up time, utilization patterns, outcomes, and other key variables</td><td align="left" valign="top">&#x201C;Consistency,&#x201D; &#x201C;eligibility criteria,&#x201D; &#x201C;selection error or bias,&#x201D; &#x201C;clinical consistency,&#x201D; &#x201C;targeted patient population,&#x201D; and &#x201C;person level&#x201D;</td></tr><tr><td align="left" valign="top" colspan="3">Cohort fitness</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Patient facts</td><td align="left" valign="top">Assesses the availability of clinical events such as drug utilization or laboratory 
events per year of follow-up</td><td align="left" valign="top">&#x201C;Completeness,&#x201D; &#x201C;missing expected data,&#x201D; &#x201C;anomalous information density,&#x201D; &#x201C;clinical follow-up,&#x201D; &#x201C;utilization patterns,&#x201D; and &#x201C;visit level&#x201D;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Patient event sequencing</td><td align="left" valign="top">Evaluates the plausibility of dates in relation to clinical events (eg, chronic kidney disease diagnosis precedes order for dialysis)</td><td align="left" valign="top">&#x201C;Consistency,&#x201D; &#x201C;plausibility,&#x201D; &#x201C;misclassification,&#x201D; &#x201C;expected clinical event representation,&#x201D; &#x201C;confirmatory clinical data,&#x201D; &#x201C;utilization patterns,&#x201D; and &#x201C;person level&#x201D;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Patient record consistency</td><td align="left" valign="top">Tests whether the clinical data in a patient&#x2019;s record are consistent and confirmatory (eg, patients with leukemia should be receiving chemotherapy)</td><td align="left" valign="top">&#x201C;Consistency,&#x201D; &#x201C;plausibility,&#x201D; &#x201C;missing expected data,&#x201D; &#x201C;clinical consistency,&#x201D; &#x201C;confirmatory clinical data,&#x201D; &#x201C;utilization patterns,&#x201D; &#x201C;valid diagnostic criteria,&#x201D; and &#x201C;person level&#x201D;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Expected facts present</td><td align="left" valign="top">Evaluates whether a cohort has specific types of clinical data, such as BMI, office visit procedures, or vital signs</td><td align="left" valign="top">&#x201C;Completeness,&#x201D; &#x201C;missing expected data,&#x201D; &#x201C;valid 
diagnostic criteria,&#x201D; &#x201C;expected clinical event representation,&#x201D; and &#x201C;visit level&#x201D;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Clinical data values and ranges</td><td align="left" valign="top">Assesses whether clinical data outcomes or values align with patient cohort characteristics</td><td align="left" valign="top">&#x201C;Consistency,&#x201D; &#x201C;misclassification,&#x201D; &#x201C;confirmatory clinical data,&#x201D; &#x201C;valid diagnostic criteria,&#x201D; and &#x201C;person level&#x201D;</td></tr><tr><td align="left" valign="top" colspan="3">Dataset fitness</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Clinical events and specialty agreement</td><td align="left" valign="top">Evaluates the concordance of specific events (eg, diagnosis of type 1 diabetes) and specialist clinicians or care sites (eg, endocrinologist or endocrinology clinic) at the visit level</td><td align="left" valign="top">&#x201C;Concordance,&#x201D; &#x201C;data representation errors,&#x201D; &#x201C;confirmatory clinical data,&#x201D; &#x201C;utilization patterns,&#x201D; and &#x201C;visit level&#x201D;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Visit clinical data agreement</td><td align="left" valign="top">Determines whether expected clinical events occur within the same clinical encounter (eg, ventilator support in the ICU<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> or initial antihypertensive prescription and blood pressure reading)</td><td align="left" valign="top">&#x201C;Concordance,&#x201D; &#x201C;missing expected data,&#x201D; &#x201C;clinical consistency,&#x201D; &#x201C;expected clinical event representation,&#x201D; and &#x201C;visit level&#x201D;</td></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Date sequencing</td><td align="left" valign="top">Detects outliers and anomalous values such as dates too far in the past or future and whether dates assigned to clinical events occur in a reasonable order or proximity</td><td align="left" valign="top">&#x201C;Plausibility,&#x201D; &#x201C;temporality inconsistency,&#x201D; &#x201C;clinical consistency,&#x201D; &#x201C;utilization patterns,&#x201D; and &#x201C;event level&#x201D;</td></tr><tr><td align="left" valign="top" colspan="3">Data conformance</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Clinical metadata</td><td align="left" valign="top">Assesses whether clinical facts are accompanied by appropriate metadata (eg, prescription drugs with dosing information and laboratory values with assigned specimens)</td><td align="left" valign="top">&#x201C;Conformance,&#x201D; &#x201C;missing expected data,&#x201D; &#x201C;data representation errors,&#x201D; &#x201C;expected clinical event representation,&#x201D; and &#x201C;event level&#x201D;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Unit and value alignment</td><td align="left" valign="top">Determines whether drug prescriptions and administrations or laboratory results contain units that are conformant to expected standards</td><td align="left" valign="top">&#x201C;Conformance,&#x201D; &#x201C;misclassification,&#x201D; &#x201C;clinical consistency,&#x201D; and &#x201C;event level&#x201D;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Duplicate record check</td><td align="left" valign="top">Identifies where there are duplicate rows or values in a given dataset</td><td align="left" valign="top">&#x201C;Conformance,&#x201D; 
&#x201C;anomalous information density,&#x201D; &#x201C;utilization patterns,&#x201D; and &#x201C;event level&#x201D;</td></tr><tr><td align="left" valign="top" colspan="3">Variable testing</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Expected variables present</td><td align="left" valign="top">Checks for the presence of expected variables and presents a variety of distributions of these variables in the dataset</td><td align="left" valign="top">&#x201C;Completeness,&#x201D; &#x201C;missing expected data,&#x201D; &#x201C;external benchmarking,&#x201D; &#x201C;expected clinical event representation,&#x201D; &#x201C;valid diagnostic criteria,&#x201D; and &#x201C;person level&#x201D;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Quantitative variable distributions</td><td align="left" valign="top">Evaluates quantitative distributions of clinical values (eg, laboratory values or BMI) or patient characteristics (eg, number of visits per patient or follow-up time)</td><td align="left" valign="top">&#x201C;Plausibility,&#x201D; &#x201C;anomalous value from internal distribution,&#x201D; &#x201C;valid diagnostic criteria,&#x201D; &#x201C;clinical consistency,&#x201D; and &#x201C;event level&#x201D;</td></tr><tr><td align="left" valign="top" colspan="3">Concept set testing</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Concept set distribution</td><td align="left" valign="top">Examines distributions of concepts (eg, codes) that represent a particular variable (eg, wheezing or uncomplicated asthma are the most common concepts represented in primary care cohort, but severe asthma may be the most common in patients with prolonged respiratory specialty care)</td><td align="left" valign="top">&#x201C;Information representation,&#x201D; 
&#x201C;data representation errors,&#x201D; &#x201C;expected clinical event representation,&#x201D; and &#x201C;event level&#x201D;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Source and concept vocabularies</td><td align="left" valign="top">Provides source-to-concept mappings and their distributions to identify potential problems related to information loss</td><td align="left" valign="top">&#x201C;Information representation,&#x201D; &#x201C;data representation errors,&#x201D; &#x201C;clinical consistency,&#x201D; and &#x201C;event level&#x201D;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Unmapped concepts</td><td align="left" valign="top">Shows the potential impact of unmapped concepts within a dataset</td><td align="left" valign="top">&#x201C;Information representation,&#x201D; &#x201C;misclassification,&#x201D; &#x201C;missing expected data,&#x201D; &#x201C;expected clinical event representation,&#x201D; and &#x201C;event level&#x201D;</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>ICU: intensive care unit.</p></fn></table-wrap-foot></table-wrap><p>Using this taxonomy, we see that current DQA tools operate largely within a few types. Tests for missingness or data density, a focus of many packages [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref43">43</xref>], fall within the &#x201C;expected facts present&#x201D; type or, in some cases, assess &#x201C;clinical metadata&#x201D; [<xref ref-type="bibr" rid="ref27">27</xref>]. 
Tests for never-valid implausibility [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref43">43</xref>] most often assess date sequencing or, in some cases, patient record consistency. Conformance-oriented packages [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref43">43</xref>] may add to these tests matching the &#x201C;unmapped concepts&#x201D; and less often the &#x201C;unit and value alignment&#x201D; or &#x201C;duplicate record&#x201D; check types. The ENACT Data Quality Explorer focuses on diagnosis code distribution [<xref ref-type="bibr" rid="ref35">35</xref>], a form of the &#x201C;expected facts present&#x201D; type, as an opportunity to infer other data quality gaps.</p><p>Viewed through this lens, it becomes clear that there is good coverage of structural and study-autonomous aspects of data quality that can disrupt analyses, but there are few resources for systematic interrogation of the clinical semantics that are equally critical to study validity. It is in these areas that we believe that the calls for increased standardization in data quality testing are most apt [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>].</p></sec><sec id="s2-5"><title>Applying Well-Formed DQA to Clinical Research</title><sec id="s2-5-1"><title>Identifying Requirements During Study Design</title><p>To illustrate how the approach that we discuss can improve clinical studies, we examine a specific need in clinical research design: assessing whether persons and data available for a study correctly identify the at-risk population, that is, form a clinically reasonable basis to ask the research question. There are several implicit aspects of this question, including the following: (1) Does the cohort accurately reflect the medical state of interest? 
(2) Are there undiscovered selection biases or stratifications in the cohort? (3) Are the data elements needed to find patients present? For this discussion, we will focus on the specific question of whether the cohort behaves as one would expect for eligible patients; that is, does it have face and possibly construct validity? The study development path we propose encapsulates these ideas in a single check type that evaluates whether the <italic>variables</italic> (ie, clinical concepts) that a study&#x2019;s design expects to find are present in a study dataset (check type: expected variables present), applied during the cohort identification and fitness phases of the study. We note that this is different from the more commonly addressed question of whether <italic>data elements</italic> (ie, labels that ideally capture specific facts for the study) are present; this would follow the &#x201C;expected facts present&#x201D; type. The model distinguishes between the 2 because they are semantically different, notwithstanding the fact that they are often confused in practice. Several other check types will be germane to the study&#x2019;s process of fitness testing. In turn, this should not only inform study design but also produce knowledge that allows others to compare this to other studies and design newer studies more efficiently. However, for the sake of this example, we will limit discussion to 1 check type. We present this as an illustration of how the framework might be used rather than as a systematic evaluation of its effect.</p><p>Having identified a relevant check type, we describe the user process phase in a hypothetical study of children with sickle cell disease (SCD). This is an uncommon condition in the general population and, hence, unlikely to drive findings in network-wide data quality testing. However, it is a major cause of both health burden and health care utilization in affected children. 
For this example, we set 4 expectations: evidence of transcranial Doppler procedure, a standard practice to assess risk of stroke; selected SCD-related laboratory results; pain diagnoses; and hydroxyurea prescription.</p></sec><sec id="s2-5-2"><title>Single-Unit Analysis</title><p>The single-unit &#x201C;expected variables present&#x201D; visualization (<xref ref-type="fig" rid="figure2">Figure 2</xref>) allows users to explore data quality as a single unit rather than a comparison of entities to evaluate the dataset in aggregate. <xref ref-type="fig" rid="figure2">Figures 2A and 2B</xref> highlight the difference between exploratory visualizations and formal anomaly detection. In this example, distributions of variables indicate that more patients in this cohort have an associated hemoglobin test result (approximately 90%) than other SCD-related tests (approximately 73%). However, there is also a high degree of overlap between patients with hemoglobin and other tests (<xref ref-type="fig" rid="figure2">Figure 2B</xref>), and the clinical expectation is that all patients followed for SCD will have results from both sets of tests. This would imply that data quality problems are more common for capturing or representing SCD-specific rather than general test results, as the former are required for a smaller set of patients. Analogously, the use of transcranial Doppler ultrasound, indicated for patients with the more physiologically severe SCD-SS and SCD-S&#x03B2;<sup>0</sup> variants of SCD, correlates fairly well with patients who are taking hydroxyurea, indicated for the same subset of patients, suggesting that patient data are more likely to be complete in this subset of patients requiring more follow-up and clinical observation. The low proportion of patients prescribed hydroxyurea overall may in itself point to a data quality problem, as the drug is indicated for all patients in this subcohort. 
Possible explanations are that the cohort criteria are too broad (SCD-SS and SCD-S&#x03B2;<sup>0</sup> represent 65%-75% of all patients with SCD) and the drug has only been formally recommended in pediatrics since 2017.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Single-unit results from an &#x201C;expected variables present&#x201D; study-specific data quality assessment check type: (A) exploratory visualization showing the proportion of the cohort having each of several clinical characteristics that would be expected for the study design, (B) exploratory visualization showing co-occurrence of variable pairs (the Jaccard index is able to account for incomplete capture of each variable, and self-comparisons are omitted), (C) longitudinal trends in completeness of several expected data elements, and (D) anomaly detection using control charts applied to longitudinal trend data.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e76398_fig02.png"/></fig><p><xref ref-type="fig" rid="figure2">Figures 2C and 2D</xref> show temporal trends for the selected variables, which may elucidate whether the low observed proportions for hydroxyurea use are driven exclusively by approval date. <xref ref-type="fig" rid="figure2">Figure 2C</xref> is a time-series analysis where all variables are plotted as a proportion of eligible patients over time. It presents relatively stable rates for hemoglobin testing and pain diagnoses, steady increases over time in both transcranial Doppler and hydroxyurea use that reflect updated clinical patient guidelines, and relatively sharp increases in 2013 for laboratory testing variables that may suggest timing of electronic health record implementation rather than differences in care or utilization patterns. <xref ref-type="fig" rid="figure2">Figure 2D</xref> shows control charts, with anomalies indicated by orange circles. 
These graphs reflect similar trends to those exhibited in the exploratory graphs; the hydroxyurea output in particular indicates that the 2017 approval influenced prescribing patterns, although multi-institutional data may clarify whether trends and proportions are heterogeneous. Overall health care utilization still remains lower than expected at approximately 70%.</p></sec><sec id="s2-5-3"><title>Multi-Unit Analysis</title><p><xref ref-type="fig" rid="figure3">Figure 3</xref> again shows both the exploratory and anomaly detection visualizations for the same variables in a multiunit (site contributor) analysis. The exploratory visualization shows a heat map with proportions of patients who have evidence of the indicated variable in the dataset (<xref ref-type="fig" rid="figure3">Figure 3A</xref>). The display showing tests for anomalous values shows a dot plot with the same color scale but with a star indicating statistical anomalies and the size of the dot indicating the mean proportion across all institutions (<xref ref-type="fig" rid="figure3">Figure 3B</xref>). The size and color of the hemoglobin results indicate that all sites capture this variable well, ranging from 78% (site A) to 95% (site J). Site A shows lower-than-expected capture of non&#x2013;diagnosis-related clinical data, whereas site D captures laboratory tests well for patients but, conversely, captures pain diagnoses, hydroxyurea use, and transcranial Doppler poorly compared with other sites. Notably, site K captures all use well except for transcranial Doppler, likely marking a problem with extracting procedure data from clinical source systems. These graphs illustrate the most frequent anomalies for transcranial Doppler, SCD-specific laboratory tests, and hydroxyurea utilization variables. 
Hydroxyurea in particular illustrates how heterogeneous data capture or clinical practice might appear, with sites B and D showing low representation (approximately 11%) and sites H and K showing high representation (42% and 52%, respectively).</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Multiunit results from applying an &#x201C;expected variables present&#x201D; study-specific data quality assessment check type: (A) exploratory visualization showing the proportion of the cohort having each of several clinical characteristics that would be expected for the study design, now stratified by institution; (B) analogous visualization identifying sites with statistically anomalous proportions for each variable; (C) smoothed (locally estimated scatterplot smoothing [LOESS] with 95% CI) longitudinal trends in the proportion of patients receiving hydroxyurea; and (D) use of the Euclidean distance to summarize the differences in longitudinal trends shown in the prior graph to facilitate identification of outliers.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e76398_fig03.png"/></fig><p>Finally, we illustrate trends across time, facilitating deeper exploration of the heterogeneity shown in the overall results. Taking hydroxyurea use as an example, <xref ref-type="fig" rid="figure3">Figure 3C</xref> reinforces the single-unit observation that the Food and Drug Administration drug approval in 2017 increased use across time. Site J appears the most impacted, with a sharp uptick in 2017 and stable use afterward, in contrast to other sites&#x2019; steady rise. The longitudinal data also reinforce the observation that sites B and D have a low proportion of patients prescribed hydroxyurea, and while slightly increasing in more recent years, the proportions remain nearly constant. 
Site B is shown with high-quality data in other domains, as would be observed with drug-specific data quality problems. In contrast, site D shows anomalous data across domains. <xref ref-type="fig" rid="figure3">Figure 3D</xref> displays the overall distance between each site and the all-site median across time. This visualization reveals that sites K, B, and D trend markedly differently from the all-site median, with site K trending higher and, as expected, sites B and D trending lower.</p><p>Taken together, this single check type (expected variables present) can produce diagnostic outputs that inform decision-making about several potential data quality problems related to study-specific requirements. Some of these findings would doubtless be identified by a study team performing ad hoc exploratory analyses but likely not all, and certainly with limited potential for cross-study reuse of either tests or results. Importantly, we describe check types designed to augment rather than replace the type of information available from network-wide DQA, allowing for interrogation of semantics critical to proposed analyses and not just data structure.</p></sec></sec></sec><sec id="s3"><title>Summary</title><p>Structuring the ways in which we approach SSDQA contributes to better and more systematic research in several ways. First, checks can be driven by a theoretically sound framework that integrates clinical and technical considerations while retaining the opportunity to configure how it is applied to meet the requirements of the user. Second, the adoption of a common framework enables reproducibility and standardization of SSDQA checks for dissemination in a wider community in keeping with principles of findability, accessibility, interoperability, and reusability [<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref48">48</xref>]. 
Third, the adoption of a shared and well-grounded framework further facilitates transparency about data beyond simple descriptions when sharing of datasets is not possible due to patient privacy considerations. Consumers of research results are better able to understand data characteristics that may have a bearing on the applicability of the results to their circumstances. Similarly, other researchers can account for differences in data source to validate or augment previous work. For example, complex differences in use of the health system across cohorts may bias study outcomes, yet such differences are only crudely reported in current publications.</p><p>The problem is complex, and efforts will necessarily be incremental for many reasons. First, the universe of potential SSDQA testing is quite large, and while the theoretical model we discuss is expansible, a finite set of check types cannot cover all possible requirements. However, SSDQA for any single study will exercise only a subset of those possibilities; the check types we highlight cover most high-frequency cases. Second, effective SSDQA requires an interdisciplinary team comprising technical, methodological, and clinical domain expertise; systematizing DQA methods does not replace the need for this expertise. Nonetheless, for such teams, a repeatable approach is particularly useful both for study-wide screening for risks and when a DQA issue is suspected. For the former, check types for cohort identification and variable exploration may inform study design, variable definitions, and analytic requirements. For the latter, data anomalies can be investigated during a study in a more predictable way that will not require designing novel checks when time is of the essence. Third, a greater diversity of check types brings with it complexity, requiring familiarization to make the most effective use of available resources. 
While adoption of well-founded methods is advantageous in the longer term, a culture shift will be needed in areas where practice is currently ad hoc. Finally, our discussion currently focuses on structured data. Given the rapid increase in the use of free-text or imaging data, translation of these ideas will be needed. Nevertheless, the check type components apply across data domains even if the subsequent development of tools differs.</p><p>Effective SSDQA is a critical but complex part of conducting valid research using clinical data. As with other steps of the research process, SSDQA benefits from evolving methodological practices. We provide an opportunity to reflect on ways to improve SSDQA effectiveness and reporting. Additional work will undoubtedly be needed as clinical research methods and data sources continue to advance.</p></sec></body><back><ack><p>The authors are grateful for the contributions of data scientists and researchers who shared their experience assessing study-specific data quality. They are also grateful to the patients, families, and clinicians in networks such as PEDSnet who make large-scale learning about ways to improve health possible.</p></ack><notes><sec><title>Funding</title><p>This work was funded by Patient-Centered Outcomes Research Institute award ME-2020C3-21199. The funder played no role in the design of the study; method development; collection, analysis, or interpretation of the data; writing of the manuscript; or decision to submit it for publication. The views expressed in this paper do not represent those of the Patient-Centered Outcomes Research Institute or participating institutions.</p></sec></notes><fn-group><fn fn-type="conflict"><p>JS is a current member of the Ronald McDonald House Charities of Greater Delaware, as well as a consultant for the Association of Child Life Professionals. 
C Blacketer is an employee of Johnson &#x0026; Johnson, holds stock and stock options, and is involved in the development of the Observational Health Data Sciences and Informatics Data Quality Dashboard and CohortDiagnostics tools. KAM is involved in the development of the PCORnet Data Curation package. These conflicts did not interfere with the work performed in this project. No authors have any financial interest in the software tools developed during this work, nor have they been involved in the development of other software evaluated in this study. All other authors declare no other conflicts of interest.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">DQA</term><def><p>data quality assessment</p></def></def-item><def-item><term id="abb2">SCD</term><def><p>sickle cell disease</p></def></def-item><def-item><term id="abb3">SPIFD</term><def><p>Structured Process to Identify Fit-for-Purpose Data</p></def></def-item><def-item><term id="abb4">SSDQA</term><def><p>study-specific data quality assessment</p></def></def-item><def-item><term id="abb5">STaRT-RWE</term><def><p>Structured Template for Planning and Reporting on the Implementation of Real-World Evidence Studies</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Forrest</surname><given-names>CB</given-names> </name><name name-style="western"><surname>Margolis</surname><given-names>PA</given-names> </name><name name-style="western"><surname>Bailey</surname><given-names>LC</given-names> </name><etal/></person-group><article-title>PEDSnet: a national pediatric learning health system</article-title><source>J Am Med Inform Assoc</source><year>2014</year><volume>21</volume><issue>4</issue><fpage>602</fpage><lpage>606</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2014-002743</pub-id><pub-id 
pub-id-type="medline">24821737</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Forrest</surname><given-names>CB</given-names> </name><name name-style="western"><surname>McTigue</surname><given-names>KM</given-names> </name><name name-style="western"><surname>Hernandez</surname><given-names>AF</given-names> </name><etal/></person-group><article-title>PCORnet&#x00AE; 2020: current state, accomplishments, and future directions</article-title><source>J Clin Epidemiol</source><year>2021</year><month>01</month><volume>129</volume><fpage>60</fpage><lpage>67</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2020.09.036</pub-id><pub-id pub-id-type="medline">33002635</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Haendel</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Chute</surname><given-names>CG</given-names> </name><name name-style="western"><surname>Bennett</surname><given-names>TD</given-names> </name><etal/></person-group><article-title>The National COVID Cohort Collaborative (N3C): rationale, design, infrastructure, and deployment</article-title><source>J Am Med Inform Assoc</source><year>2021</year><month>03</month><day>1</day><volume>28</volume><issue>3</issue><fpage>427</fpage><lpage>443</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaa196</pub-id><pub-id pub-id-type="medline">32805036</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><collab>All of Us Research Program Investigators</collab><name name-style="western"><surname>Denny</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Rutter</surname><given-names>JL</given-names> 
</name><etal/></person-group><article-title>The &#x201C;All of Us&#x201D; research program</article-title><source>N Engl J Med</source><year>2019</year><month>08</month><day>15</day><volume>381</volume><issue>7</issue><fpage>668</fpage><lpage>676</lpage><pub-id pub-id-type="doi">10.1056/NEJMsr1809937</pub-id><pub-id pub-id-type="medline">31412182</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tarabichi</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Frees</surname><given-names>A</given-names> </name><name name-style="western"><surname>Honeywell</surname><given-names>S</given-names> </name><etal/></person-group><article-title>The Cosmos collaborative: a vendor-facilitated electronic health record data aggregation platform</article-title><source>ACI Open</source><year>2021</year><month>01</month><volume>5</volume><issue>1</issue><fpage>e36</fpage><lpage>e46</lpage><pub-id pub-id-type="doi">10.1055/s-0041-1731004</pub-id><pub-id pub-id-type="medline">35071993</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Drozda</surname><given-names>JP</given-names>  <suffix>Jr</suffix></name><name name-style="western"><surname>Graham</surname><given-names>J</given-names> </name><name name-style="western"><surname>Muhlestein</surname><given-names>JB</given-names> </name><etal/></person-group><article-title>Multi-institutional distributed data networks for real-world evidence about medical devices: building unique device identifiers into longitudinal data (BUILD)</article-title><source>JAMIA Open</source><year>2022</year><volume>5</volume><issue>2</issue><fpage>ooac035</fpage><pub-id pub-id-type="doi">10.1093/jamiaopen/ooac035</pub-id><pub-id 
pub-id-type="medline">35663113</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pedrera-Jimenez</surname><given-names>M</given-names> </name><name name-style="western"><surname>Garcia-Barrio</surname><given-names>N</given-names> </name><name name-style="western"><surname>Hernandez-Ibarburu</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Building an i2b2-based population repository for COVID-19 research</article-title><source>Stud Health Technol Inform</source><year>2022</year><month>05</month><day>25</day><volume>294</volume><fpage>287</fpage><lpage>291</lpage><pub-id pub-id-type="doi">10.3233/SHTI220460</pub-id><pub-id pub-id-type="medline">35612078</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ohno-Machado</surname><given-names>L</given-names> </name><name name-style="western"><surname>Agha</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Bell</surname><given-names>DS</given-names> </name><etal/></person-group><article-title>pSCANNER: patient-centered Scalable National Network for Effectiveness Research</article-title><source>J Am Med Inform Assoc</source><year>2014</year><volume>21</volume><issue>4</issue><fpage>621</fpage><lpage>626</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2014-002751</pub-id><pub-id pub-id-type="medline">24780722</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Desai</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Marsolo</surname><given-names>K</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>J</given-names> 
</name><etal/></person-group><article-title>The FDA Sentinel Real World Evidence Data Enterprise (RWE-DE)</article-title><source>Pharmacoepidemiol Drug Saf</source><year>2024</year><month>10</month><volume>33</volume><issue>10</issue><fpage>e70028</fpage><pub-id pub-id-type="doi">10.1002/pds.70028</pub-id><pub-id pub-id-type="medline">39385712</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>KJ</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Gagne</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Longitudinal data discontinuity in electronic health records and consequences for medication effectiveness studies</article-title><source>Clin Pharmacol Ther</source><year>2022</year><month>01</month><volume>111</volume><issue>1</issue><fpage>243</fpage><lpage>251</lpage><pub-id pub-id-type="doi">10.1002/cpt.2400</pub-id><pub-id pub-id-type="medline">34424534</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Koffman</surname><given-names>L</given-names> </name><name name-style="western"><surname>Levis</surname><given-names>AW</given-names> </name><name name-style="western"><surname>Arterburn</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Investigating bias from missing data in an electronic health records-based study of weight loss after bariatric surgery</article-title><source>Obes Surg</source><year>2021</year><month>05</month><volume>31</volume><issue>5</issue><fpage>2125</fpage><lpage>2135</lpage><pub-id pub-id-type="doi">10.1007/s11695-021-05226-y</pub-id><pub-id pub-id-type="medline">33462670</pub-id></nlm-citation></ref><ref 
id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Boyd</surname><given-names>AD</given-names> </name><name name-style="western"><surname>Gonzalez-Guarda</surname><given-names>R</given-names> </name><name name-style="western"><surname>Lawrence</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Potential bias and lack of generalizability in electronic health record data: reflections on health equity from the National Institutes of Health Pragmatic Trials Collaboratory</article-title><source>J Am Med Inform Assoc</source><year>2023</year><month>08</month><day>18</day><volume>30</volume><issue>9</issue><fpage>1561</fpage><lpage>1566</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocad115</pub-id><pub-id pub-id-type="medline">37364017</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hubbard</surname><given-names>RA</given-names> </name><name name-style="western"><surname>Lett</surname><given-names>E</given-names> </name><name name-style="western"><surname>Ho</surname><given-names>GY</given-names> </name><name name-style="western"><surname>Chubak</surname><given-names>J</given-names> </name></person-group><article-title>Characterizing bias due to differential exposure ascertainment in electronic health record data</article-title><source>Health Serv Outcomes Res Methodol</source><year>2021</year><month>09</month><volume>21</volume><issue>3</issue><fpage>309</fpage><lpage>323</lpage><pub-id pub-id-type="doi">10.1007/s10742-020-00235-3</pub-id><pub-id pub-id-type="medline">34366704</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Harton</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Mitra</surname><given-names>N</given-names> </name><name name-style="western"><surname>Hubbard</surname><given-names>RA</given-names> </name></person-group><article-title>Informative presence bias in analyses of electronic health records-derived data: a cautionary note</article-title><source>J Am Med Inform Assoc</source><year>2022</year><month>06</month><day>14</day><volume>29</volume><issue>7</issue><fpage>1191</fpage><lpage>1199</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocac050</pub-id><pub-id pub-id-type="medline">35438796</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Haneuse</surname><given-names>S</given-names> </name><name name-style="western"><surname>Arterburn</surname><given-names>D</given-names> </name><name name-style="western"><surname>Daniels</surname><given-names>MJ</given-names> </name></person-group><article-title>Assessing missing data assumptions in EHR-based studies: a complex and underappreciated task</article-title><source>JAMA Netw Open</source><year>2021</year><month>02</month><day>1</day><volume>4</volume><issue>2</issue><fpage>e210184</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2021.0184</pub-id><pub-id pub-id-type="medline">33635321</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Weiskopf</surname><given-names>NG</given-names> </name><name name-style="western"><surname>Dorr</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Jackson</surname><given-names>C</given-names> </name><name name-style="western"><surname>Lehmann</surname><given-names>HP</given-names> </name><name name-style="western"><surname>Thompson</surname><given-names>CA</given-names> </name></person-group><article-title>Healthcare utilization 
is a collider: an introduction to collider bias in EHR data reuse</article-title><source>J Am Med Inform Assoc</source><year>2023</year><month>04</month><day>19</day><volume>30</volume><issue>5</issue><fpage>971</fpage><lpage>977</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocad013</pub-id><pub-id pub-id-type="medline">36752649</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Blacketer</surname><given-names>C</given-names> </name><name name-style="western"><surname>Defalco</surname><given-names>FJ</given-names> </name><name name-style="western"><surname>Ryan</surname><given-names>PB</given-names> </name><name name-style="western"><surname>Rijnbeek</surname><given-names>PR</given-names> </name></person-group><article-title>Increasing trust in real-world evidence through evaluation of observational data quality</article-title><source>J Am Med Inform Assoc</source><year>2021</year><month>09</month><day>18</day><volume>28</volume><issue>10</issue><fpage>2251</fpage><lpage>2257</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocab132</pub-id><pub-id pub-id-type="medline">34313749</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Castellanos</surname><given-names>EH</given-names> </name><name name-style="western"><surname>Wittmershaus</surname><given-names>BK</given-names> </name><name name-style="western"><surname>Chandwani</surname><given-names>S</given-names> </name></person-group><article-title>Raising the bar for real-world data in oncology: approaches to quality across multiple dimensions</article-title><source>JCO Clin Cancer Inform</source><year>2024</year><month>01</month><volume>8</volume><fpage>e2300046</fpage><pub-id pub-id-type="doi">10.1200/CCI.23.00046</pub-id><pub-id 
pub-id-type="medline">38241599</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Feder</surname><given-names>SL</given-names> </name></person-group><article-title>Data quality in electronic health records research: quality domains and assessment methods</article-title><source>West J Nurs Res</source><year>2018</year><month>05</month><volume>40</volume><issue>5</issue><fpage>753</fpage><lpage>766</lpage><pub-id pub-id-type="doi">10.1177/0193945916689084</pub-id><pub-id pub-id-type="medline">28322657</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huser</surname><given-names>V</given-names> </name><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Extending Achilles heel data quality tool with new rules informed by multi-site data quality comparison</article-title><source>Stud Health Technol Inform</source><year>2019</year><month>08</month><day>21</day><volume>264</volume><fpage>1488</fpage><lpage>1489</lpage><pub-id pub-id-type="doi">10.3233/SHTI190498</pub-id><pub-id pub-id-type="medline">31438195</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>SG</given-names> </name><name name-style="western"><surname>Speedie</surname><given-names>S</given-names> </name><name name-style="western"><surname>Simon</surname><given-names>G</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>V</given-names> </name><name 
name-style="western"><surname>Westra</surname><given-names>BL</given-names> </name></person-group><article-title>Quantifying the effect of data quality on the validity of an eMeasure</article-title><source>Appl Clin Inform</source><year>2017</year><month>10</month><volume>8</volume><issue>4</issue><fpage>1012</fpage><lpage>1021</lpage><pub-id pub-id-type="doi">10.4338/ACI-2017-03-RA-0042</pub-id><pub-id pub-id-type="medline">29241241</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kapsner</surname><given-names>LA</given-names> </name><name name-style="western"><surname>Kampf</surname><given-names>MO</given-names> </name><name name-style="western"><surname>Seuchter</surname><given-names>SA</given-names> </name><etal/></person-group><article-title>Moving towards an EHR data quality framework: the MIRACUM approach</article-title><source>Stud Health Technol Inform</source><year>2019</year><month>09</month><day>3</day><volume>267</volume><fpage>247</fpage><lpage>253</lpage><pub-id pub-id-type="doi">10.3233/SHTI190834</pub-id><pub-id pub-id-type="medline">31483279</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khare</surname><given-names>R</given-names> </name><name name-style="western"><surname>Utidjian</surname><given-names>LH</given-names> </name><name name-style="western"><surname>Razzaghi</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Design and refinement of a data quality assessment workflow for a large pediatric research network</article-title><source>EGEMS (Wash DC)</source><year>2019</year><month>08</month><day>1</day><volume>7</volume><issue>1</issue><fpage>36</fpage><pub-id pub-id-type="doi">10.5334/egems.294</pub-id><pub-id 
pub-id-type="medline">31531382</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Weiskopf</surname><given-names>N</given-names> </name><name name-style="western"><surname>Pathak</surname><given-names>J</given-names> </name></person-group><article-title>A framework for data quality assessment in clinical research datasets</article-title><source>AMIA Annu Symp Proc</source><year>2017</year><volume>2017</volume><fpage>1080</fpage><lpage>1089</lpage><pub-id pub-id-type="medline">29854176</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mohamed</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Song</surname><given-names>X</given-names> </name><name name-style="western"><surname>McMahon</surname><given-names>TM</given-names> </name><etal/></person-group><article-title>Tailoring rule-based data quality assessment to the Patient-Centered Outcomes Research Network (PCORnet) Common Data Model (CDM)</article-title><source>AMIA Annu Symp Proc</source><year>2022</year><volume>2022</volume><fpage>775</fpage><lpage>784</lpage><pub-id pub-id-type="medline">37128433</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pfaff</surname><given-names>ER</given-names> </name><name name-style="western"><surname>Girvin</surname><given-names>AT</given-names> </name><name name-style="western"><surname>Gabriel</surname><given-names>DL</given-names> </name><etal/></person-group><article-title>Synergies between centralized and federated approaches to data quality: a report from the National COVID Cohort 
Collaborative</article-title><source>J Am Med Inform Assoc</source><year>2022</year><month>03</month><day>15</day><volume>29</volume><issue>4</issue><fpage>609</fpage><lpage>618</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocab217</pub-id><pub-id pub-id-type="medline">34590684</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Qualls</surname><given-names>LG</given-names> </name><name name-style="western"><surname>Phillips</surname><given-names>TA</given-names> </name><name name-style="western"><surname>Hammill</surname><given-names>BG</given-names> </name><etal/></person-group><article-title>Evaluating foundational data quality in the National Patient-Centered Clinical Research Network (PCORnet&#x00AE;)</article-title><source>EGEMS (Wash DC)</source><year>2018</year><month>04</month><day>13</day><volume>6</volume><issue>1</issue><fpage>3</fpage><pub-id pub-id-type="doi">10.5334/egems.199</pub-id><pub-id pub-id-type="medline">29881761</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sengupta</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bachman</surname><given-names>D</given-names> </name><name name-style="western"><surname>Laws</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Data quality assessment and multi-organizational reporting: tools to enhance network knowledge</article-title><source>EGEMS (Wash DC)</source><year>2019</year><month>03</month><day>29</day><volume>7</volume><issue>1</issue><fpage>8</fpage><pub-id pub-id-type="doi">10.5334/egems.280</pub-id><pub-id pub-id-type="medline">30972357</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Sidky</surname><given-names>H</given-names> </name><name name-style="western"><surname>Young</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Girvin</surname><given-names>AT</given-names> </name><etal/></person-group><article-title>Data quality considerations for evaluating COVID-19 treatments using real world data: learnings from the National COVID Cohort Collaborative (N3C)</article-title><source>BMC Med Res Methodol</source><year>2023</year><month>02</month><day>17</day><volume>23</volume><issue>1</issue><fpage>46</fpage><pub-id pub-id-type="doi">10.1186/s12874-023-01839-2</pub-id><pub-id pub-id-type="medline">36800930</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Taggart</surname><given-names>J</given-names> </name><name name-style="western"><surname>Liaw</surname><given-names>ST</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>H</given-names> </name></person-group><article-title>Structured data quality reports to improve EHR data quality</article-title><source>Int J Med Inform</source><year>2015</year><month>12</month><volume>84</volume><issue>12</issue><fpage>1094</fpage><lpage>1098</lpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2015.09.008</pub-id><pub-id pub-id-type="medline">26480872</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Weiskopf</surname><given-names>NG</given-names> </name><name name-style="western"><surname>Bakken</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hripcsak</surname><given-names>G</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>C</given-names> </name></person-group><article-title>A data quality assessment 
guideline for electronic health record data reuse</article-title><source>EGEMS (Wash DC)</source><year>2017</year><month>09</month><day>4</day><volume>5</volume><issue>1</issue><fpage>14</fpage><pub-id pub-id-type="doi">10.5334/egems.218</pub-id><pub-id pub-id-type="medline">29881734</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Weiskopf</surname><given-names>NG</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>C</given-names> </name></person-group><article-title>Methods and dimensions of electronic health record data quality assessment: enabling reuse for clinical research</article-title><source>J Am Med Inform Assoc</source><year>2013</year><month>01</month><day>1</day><volume>20</volume><issue>1</issue><fpage>144</fpage><lpage>151</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2011-000681</pub-id><pub-id pub-id-type="medline">22733976</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lewis</surname><given-names>AE</given-names> </name><name name-style="western"><surname>Weiskopf</surname><given-names>N</given-names> </name><name name-style="western"><surname>Abrams</surname><given-names>ZB</given-names> </name><etal/></person-group><article-title>Electronic health record data quality assessment and tools: a systematic review</article-title><source>J Am Med Inform Assoc</source><year>2023</year><month>09</month><day>25</day><volume>30</volume><issue>10</issue><fpage>1730</fpage><lpage>1740</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocad120</pub-id><pub-id pub-id-type="medline">37390812</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Ozonze</surname><given-names>O</given-names> </name><name name-style="western"><surname>Scott</surname><given-names>PJ</given-names> </name><name name-style="western"><surname>Hopgood</surname><given-names>AA</given-names> </name></person-group><article-title>Automating electronic health record data quality assessment</article-title><source>J Med Syst</source><year>2023</year><month>02</month><day>13</day><volume>47</volume><issue>1</issue><fpage>23</fpage><pub-id pub-id-type="doi">10.1007/s10916-022-01892-2</pub-id><pub-id pub-id-type="medline">36781551</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>TD</given-names> </name><name name-style="western"><surname>Henderson</surname><given-names>DW</given-names> </name><name name-style="western"><surname>Weber</surname><given-names>GM</given-names> </name><etal/></person-group><article-title>Understanding data differences across the ENACT federated research network</article-title><source>medRxiv</source><comment>Preprint posted online on Jan 17, 2025</comment><pub-id pub-id-type="doi">10.1101/2025.01.17.25320686</pub-id><pub-id pub-id-type="medline">39867368</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gatto</surname><given-names>NM</given-names> </name><name name-style="western"><surname>Reynolds</surname><given-names>RF</given-names> </name><name name-style="western"><surname>Campbell</surname><given-names>UB</given-names> </name></person-group><article-title>A structured preapproval and postapproval comparative study design framework to generate valid and transparent real-world evidence for regulatory decisions</article-title><source>Clin Pharmacol 
Ther</source><year>2019</year><month>07</month><volume>106</volume><issue>1</issue><fpage>103</fpage><lpage>115</lpage><pub-id pub-id-type="doi">10.1002/cpt.1480</pub-id><pub-id pub-id-type="medline">31025311</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gatto</surname><given-names>NM</given-names> </name><name name-style="western"><surname>Vititoe</surname><given-names>SE</given-names> </name><name name-style="western"><surname>Rubinstein</surname><given-names>E</given-names> </name><name name-style="western"><surname>Reynolds</surname><given-names>RF</given-names> </name><name name-style="western"><surname>Campbell</surname><given-names>UB</given-names> </name></person-group><article-title>A structured process to identify fit-for-purpose study design and data to generate valid and transparent real-world evidence for regulatory uses</article-title><source>Clin Pharmacol Ther</source><year>2023</year><month>06</month><volume>113</volume><issue>6</issue><fpage>1235</fpage><lpage>1239</lpage><pub-id pub-id-type="doi">10.1002/cpt.2883</pub-id><pub-id pub-id-type="medline">36871138</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>SV</given-names> </name><name name-style="western"><surname>Pinheiro</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hua</surname><given-names>W</given-names> </name><etal/></person-group><article-title>STaRT-RWE: structured template for planning and reporting on the implementation of real world evidence studies</article-title><source>BMJ</source><year>2021</year><month>01</month><day>12</day><volume>372</volume><fpage>m4856</fpage><pub-id pub-id-type="doi">10.1136/bmj.m4856</pub-id><pub-id 
pub-id-type="medline">33436424</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Diaz-Garelli</surname><given-names>JF</given-names> </name><name name-style="western"><surname>Bernstam</surname><given-names>EV</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hwang</surname><given-names>KO</given-names> </name><name name-style="western"><surname>Rahbar</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Johnson</surname><given-names>TR</given-names> </name></person-group><article-title>DataGauge: a practical process for systematically designing and implementing quality assessments of repurposed clinical data</article-title><source>EGEMS (Wash DC)</source><year>2019</year><month>07</month><day>25</day><volume>7</volume><issue>1</issue><fpage>32</fpage><pub-id pub-id-type="doi">10.5334/egems.286</pub-id><pub-id pub-id-type="medline">31367649</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Bergquist</surname><given-names>T</given-names> </name><name name-style="western"><surname>Hassan</surname><given-names>M</given-names> </name></person-group><article-title>DQe-c-v2</article-title><source>GitHub</source><access-date>2026-04-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/data2health/DQe-c-v2/">https://github.com/data2health/DQe-c-v2/</ext-link></comment></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kapsner</surname><given-names>LA</given-names> </name><name name-style="western"><surname>Mang</surname><given-names>JM</given-names> 
</name><name name-style="western"><surname>Mate</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Linking a consortium-wide data quality assessment tool with the MIRACUM metadata repository</article-title><source>Appl Clin Inform</source><year>2021</year><month>08</month><volume>12</volume><issue>4</issue><fpage>826</fpage><lpage>835</lpage><pub-id pub-id-type="doi">10.1055/s-0041-1733847</pub-id><pub-id pub-id-type="medline">34433217</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Struckmann</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mari&#x00F1;o</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kasbohm</surname><given-names>E</given-names> </name><name name-style="western"><surname>Salogni</surname><given-names>E</given-names> </name><name name-style="western"><surname>Schmidt</surname><given-names>CO</given-names> </name></person-group><article-title>dataquieR 2: an updated R package for FAIR data quality assessments in observational studies and electronic health record data</article-title><source>J Open Source Softw</source><year>2024</year><volume>9</volume><issue>98</issue><fpage>6581</fpage><pub-id pub-id-type="doi">10.21105/joss.06581</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rao</surname><given-names>GA</given-names> </name><name name-style="western"><surname>Shoaibi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Makadia</surname><given-names>R</given-names> </name><etal/></person-group><article-title>CohortDiagnostics: phenotype evaluation across a network of observational data sources using population-level characterization</article-title><source>PLoS 
One</source><year>2025</year><volume>20</volume><issue>1</issue><fpage>e0310634</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0310634</pub-id><pub-id pub-id-type="medline">39820599</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Razzaghi</surname><given-names>H</given-names> </name><name name-style="western"><surname>Greenberg</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bailey</surname><given-names>LC</given-names> </name></person-group><article-title>Developing a systematic approach to assessing data quality in secondary use of clinical data based on intended use</article-title><source>Learn Health Syst</source><year>2021</year><volume>6</volume><issue>1</issue><fpage>e10264</fpage><pub-id pub-id-type="doi">10.1002/lrh2.10264</pub-id><pub-id pub-id-type="medline">35036548</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kahn</surname><given-names>MG</given-names> </name><name name-style="western"><surname>Callahan</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Barnard</surname><given-names>J</given-names> </name><etal/></person-group><article-title>A harmonized data quality assessment terminology and framework for the secondary use of electronic health record data</article-title><source>EGEMS (Wash DC)</source><year>2016</year><volume>4</volume><issue>1</issue><fpage>1244</fpage><pub-id pub-id-type="doi">10.13063/2327-9214.1244</pub-id><pub-id pub-id-type="medline">27713905</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Razzaghi</surname><given-names>H</given-names> </name><name 
name-style="western"><surname>Wieand</surname><given-names>K</given-names> </name><name name-style="western"><surname>Dickinson</surname><given-names>K</given-names> </name><name name-style="western"><surname>Bailey</surname><given-names>C</given-names> </name></person-group><article-title>Data quality modules</article-title><source>PEDSpace</source><access-date>2026-03-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://pedsnet.org/metadata/communities/57fd85d3-0b50-4239-8f05-8db5e0e65a6a/browse/author">https://pedsnet.org/metadata/communities/57fd85d3-0b50-4239-8f05-8db5e0e65a6a/browse/author</ext-link></comment></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wilkinson</surname><given-names>MD</given-names> </name><name name-style="western"><surname>Dumontier</surname><given-names>M</given-names> </name><name name-style="western"><surname>Aalbersberg</surname><given-names>IJJ</given-names> </name><etal/></person-group><article-title>The FAIR guiding principles for scientific data management and stewardship</article-title><source>Sci Data</source><year>2016</year><month>03</month><day>15</day><volume>3</volume><fpage>160018</fpage><pub-id pub-id-type="doi">10.1038/sdata.2016.18</pub-id><pub-id pub-id-type="medline">26978244</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Haux</surname><given-names>C</given-names> </name><name name-style="western"><surname>Knaup</surname><given-names>P</given-names> </name></person-group><article-title>Using FAIR metadata for secondary use of administrative claims data</article-title><source>Stud Health Technol Inform</source><year>2019</year><month>08</month><day>21</day><volume>264</volume><fpage>1472</fpage><lpage>1473</lpage><pub-id 
pub-id-type="doi">10.3233/SHTI190490</pub-id><pub-id pub-id-type="medline">31438187</pub-id></nlm-citation></ref></ref-list></back></article>