<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v22i11e23139</article-id>
      <article-id pub-id-type="pmid">33196453</article-id>
      <article-id pub-id-type="doi">10.2196/23139</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Evaluating Identity Disclosure Risk in Fully Synthetic Health Data: Model Development and Validation</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Guness</surname>
            <given-names>Shivanand</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>El Emam</surname>
            <given-names>Khaled</given-names>
          </name>
          <degrees>BEng, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>School of Epidemiology and Public Health</institution>
            <institution>Faculty of Medicine</institution>
            <institution>University of Ottawa</institution>
            <addr-line>401 Smyth Road</addr-line>
            <addr-line>Ottawa, ON, K1H 8L1</addr-line>
            <country>Canada</country>
            <phone>1 6137975412</phone>
            <email>kelemam@ehealthinformation.ca</email>
          </address>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3325-4149</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Mosquera</surname>
            <given-names>Lucy</given-names>
          </name>
          <degrees>BSc, MSc</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5289-8372</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Bass</surname>
            <given-names>Jason</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5848-4798</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>School of Epidemiology and Public Health</institution>
        <institution>Faculty of Medicine</institution>
        <institution>University of Ottawa</institution>
        <addr-line>Ottawa, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Children's Hospital of Eastern Ontario Research Institute</institution>
        <addr-line>Ottawa, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Replica Analytics Ltd</institution>
        <addr-line>Ottawa, ON</addr-line>
        <country>Canada</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Khaled El Emam <email>kelemam@ehealthinformation.ca</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>11</month>
        <year>2020</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>16</day>
        <month>11</month>
        <year>2020</year>
      </pub-date>
      <volume>22</volume>
      <issue>11</issue>
      <elocation-id>e23139</elocation-id>
      <history>
        <date date-type="received">
          <day>2</day>
          <month>8</month>
          <year>2020</year>
        </date>
        <date date-type="rev-request">
          <day>27</day>
          <month>8</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>2</day>
          <month>9</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>10</day>
          <month>10</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©Khaled El Emam, Lucy Mosquera, Jason Bass. Originally published in the Journal of Medical Internet Research (http://www.jmir.org), 16.11.2020.</copyright-statement>
      <copyright-year>2020</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on http://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="http://www.jmir.org/2020/11/e23139/" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>There has been growing interest in data synthesis for enabling the sharing of data for secondary analysis; however, there is a need for a comprehensive privacy risk model for fully synthetic data: If the generative models have been overfit, then it is possible to identify individuals from synthetic data and learn something new about them.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>The purpose of this study is to develop and apply a methodology for evaluating the identity disclosure risks of fully synthetic data.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>A full risk model is presented, which evaluates both identity disclosure and the ability of an adversary to learn something new if there is a match between a synthetic record and a real person. We term this “meaningful identity disclosure risk.” The model is applied on samples from the Washington State Hospital discharge database (2007) and the Canadian COVID-19 cases database. Both of these datasets were synthesized using a sequential decision tree process commonly used to synthesize health and social science data.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The meaningful identity disclosure risk for both of these synthesized samples was below the commonly used 0.09 risk threshold (0.0198 and 0.0086, respectively), and 4 times and 5 times lower than the risk values for the original datasets, respectively.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>We have presented a comprehensive identity disclosure risk model for fully synthetic data. The results for this synthesis method on 2 datasets demonstrate that synthesis can reduce meaningful identity disclosure risks considerably. The risk model can be applied in the future to evaluate the privacy of fully synthetic data.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>synthetic data</kwd>
        <kwd>privacy</kwd>
        <kwd>data sharing</kwd>
        <kwd>data access</kwd>
        <kwd>de-identification</kwd>
        <kwd>open data</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Data Access Challenges</title>
        <p>Access to data for building and testing artificial intelligence and machine learning (AIML) models has been problematic in practice and presents a challenge for the adoption of AIML [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. A recent analysis concluded that data access issues are ranked in the top 3 challenges faced by organizations when implementing AI [<xref ref-type="bibr" rid="ref3">3</xref>].</p>
        <p>A key obstacle to data access has been analyst concerns about privacy and meeting growing privacy obligations. For example, a recent survey by O’Reilly [<xref ref-type="bibr" rid="ref4">4</xref>] highlighted the privacy concerns of organizations adopting machine learning models, with more than half of those experienced with AIML checking for privacy issues. Specific to health care data, a National Academy of Medicine/Government Accountability Office report highlights privacy as presenting a data access barrier for the application of AI in health care [<xref ref-type="bibr" rid="ref5">5</xref>].</p>
        <p>Anonymization is one approach for addressing privacy concerns when making data available for secondary purposes such as AIML [<xref ref-type="bibr" rid="ref6">6</xref>]. However, there have been repeated claims of successful re-identification attacks on anonymized data [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref13">13</xref>], eroding public and regulator trust in this approach [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref22">22</xref>].</p>
        <p>Synthetic data generation is another approach for addressing privacy concerns that has been gaining interest recently [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. Different generative models have been proposed, such as decision tree–based approaches [<xref ref-type="bibr" rid="ref25">25</xref>] and deep learning methods like Variational Auto Encoders [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>] and Generative Adversarial Networks (GANs) [<xref ref-type="bibr" rid="ref28">28</xref>-<xref ref-type="bibr" rid="ref31">31</xref>].</p>
        <p>There are different types of privacy risks. One of them is identity disclosure [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref32">32</xref>], which in our context means the risk of correctly mapping a synthetic record to a real person. Current identity disclosure assessment models for synthetic data have been limited in that they were formulated under the assumption of partially synthetic data [<xref ref-type="bibr" rid="ref33">33</xref>-<xref ref-type="bibr" rid="ref39">39</xref>]. Partially synthetic data permit the direct matching of synthetic records with real people because there is a one-to-one mapping between real individuals and the partially synthetic records. However, that assumption cannot be made with <italic>fully</italic> synthetic data whereby there is no direct mapping between a synthetic record and a real individual.</p>
        <p>Some researchers have argued that fully synthetic data does not have an identity disclosure risk [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref40">40</xref>-<xref ref-type="bibr" rid="ref46">46</xref>]. However, if the synthesizer is overfit to the original data, then a synthetic record can be mapped to a real person [<xref ref-type="bibr" rid="ref47">47</xref>]. Since there are degrees of overfitting, even a partial mapping may represent unacceptable privacy risk. Therefore, identity disclosure is still relevant for fully synthetic data.</p>
        <p>Another type of privacy risk is attribution risk [<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref47">47</xref>], which is defined as an adversary learning that a specific individual has a certain characteristic. In this paper, we present a comprehensive privacy model that combines identity disclosure and attribution risk for fully synthetic data, where attribution is conditional on identity disclosure. This definition of privacy risk is complementary to the notion of membership disclosure as it has been operationalized in the data synthesis literature, where similarity between real and synthetic records is assessed [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref48">48</xref>]. We then demonstrate the model on health data.</p>
      </sec>
      <sec>
        <title>Background</title>
        <p>Key definitions and requirements will be presented, followed by a model for assessing identity disclosure risk. As a general rule, we have erred on the conservative side when presented with multiple design or parameter options to ensure that patient privacy would be less likely to be compromised.</p>
        <sec>
          <title>Definitions—Basic Concepts</title>
          <p>The basic scheme that we are assuming is illustrated in <xref rid="figure1" ref-type="fig">Figure 1</xref>. We have a real population denoted by the set <italic>P</italic> of size <italic>N</italic>. A real sample <italic>R</italic> exists such that <italic>R</italic>⊆<italic>P</italic>, and it is the set from which we wish to create a synthetic dataset <italic>S</italic>. Without loss of generality, the real and synthetic samples are assumed to be the same size, <italic>n</italic>.</p>
          <fig id="figure1" position="float">
            <label>Figure 1</label>
            <caption>
              <p>The relationships between the different datasets under consideration. Matching between a synthetic sample record and someone in the population goes through the real sample and can occur in 2 directions.</p>
            </caption>
            <graphic xlink:href="jmir_v22i11e23139_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <p>The data custodian makes the synthetic sample available for secondary purposes but does not share the generative model that is used to produce the synthetic sample. Therefore, our risk scenario is when the adversary only has access to the synthetic data.</p>
          <p>Synthetic records can be identified by matching them with individuals in the population. When matching is performed to identify synthetic records, that matching is done on the <italic>quasi-identifiers</italic>, which are a subset of the variables and are known by an adversary [<xref ref-type="bibr" rid="ref49">49</xref>]. For example, typically, a date of birth is a quasi-identifier because it is information about individuals that is known or that is relatively easy for an adversary to find out (eg, from voter registration lists [<xref ref-type="bibr" rid="ref50">50</xref>]). More generally, an adversary may know the quasi-identifiers about an individual because that individual is an acquaintance of the adversary or because the adversary has access to a population database or registry of identifiable information.</p>
          <p>The variables that are not quasi-identifiers will be referred to as <italic>sensitive variables</italic>. For example, if a dataset has information about drug use, that would be a sensitive variable that could cause harm if it was known. In general, we assume that sensitive values would cause some degree of harm if they become known to an adversary.</p>
          <p>To illustrate the privacy risks with fully synthetic data, consider the population data in <xref ref-type="table" rid="table1">Table 1</xref>. Individuals in the population are identifiable through their national IDs. We will treat the variable of one’s origin as a quasi-identifier and one’s income as the sensitive value. <xref ref-type="table" rid="table2">Table 2</xref> displays the records from the real sample, and <xref ref-type="table" rid="table3">Table 3</xref> presents records for the synthetic sample.</p>
          <p>As can be seen, there is only one North African individual and one European individual in the population, and they both are in the real sample. These unique real sample records would therefore match 1:1 with the population and would have a very high risk of being identified. The population-unique European and North African records are also in the synthetic data, and thus, here we have a 1:1 match between the synthetic records and the population.</p>
          <p>The sensitive income value in the synthetic sample is very similar to the value in the real sample for the North African record. Therefore, arguably, we also learn something new about that individual. The sensitive income value is not so close for the European record, and therefore, even though we are able to match on the quasi-identifier, we will not learn meaningful information about that specific individual from synthetic data.</p>
          <table-wrap position="float" id="table1">
            <label>Table 1</label>
            <caption>
              <p>Example of a population dataset, with one’s origin as the quasi-identifier and one’s income as the sensitive variable.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="170"/>
              <col width="410"/>
              <col width="420"/>
              <thead>
                <tr valign="top">
                  <td>National ID</td>
                  <td>Origin</td>
                  <td>Income ($)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>1</td>
                  <td>Japanese</td>
                  <td>110k</td>
                </tr>
                <tr valign="top">
                  <td>2</td>
                  <td>Japanese</td>
                  <td>100k</td>
                </tr>
                <tr valign="top">
                  <td>3</td>
                  <td>Japanese</td>
                  <td>105k</td>
                </tr>
                <tr valign="top">
                  <td>4</td>
                  <td>North African</td>
                  <td>95k</td>
                </tr>
                <tr valign="top">
                  <td>5</td>
                  <td>European</td>
                  <td>70k</td>
                </tr>
                <tr valign="top">
                  <td>6</td>
                  <td>Hispanic</td>
                  <td>100k</td>
                </tr>
                <tr valign="top">
                  <td>7</td>
                  <td>Hispanic</td>
                  <td>130k</td>
                </tr>
                <tr valign="top">
                  <td>8</td>
                  <td>Hispanic</td>
                  <td>65k</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <p/>
          <table-wrap position="float" id="table2">
            <label>Table 2</label>
            <caption>
              <p>Example of a real sample, with one’s origin as the quasi-identifier and one’s income as the sensitive variable.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="500"/>
              <col width="500"/>
              <thead>
                <tr valign="top">
                  <td>Origin</td>
                  <td>Income ($)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>European</td>
                  <td>70k</td>
                </tr>
                <tr valign="top">
                  <td>Japanese</td>
                  <td>100k</td>
                </tr>
                <tr valign="top">
                  <td>Hispanic</td>
                  <td>130k</td>
                </tr>
                <tr valign="top">
                  <td>Hispanic</td>
                  <td>65k</td>
                </tr>
                <tr valign="top">
                  <td>North African</td>
                  <td>95k</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <p/>
          <table-wrap position="float" id="table3">
            <label>Table 3</label>
            <caption>
              <p>Example of a synthetic sample, with one’s origin as the quasi-identifier and one’s income as the sensitive variable.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="500"/>
              <col width="500"/>
              <thead>
                <tr valign="top">
                  <td>Origin</td>
                  <td>Income ($)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Japanese</td>
                  <td>115k</td>
                </tr>
                <tr valign="top">
                  <td>Japanese</td>
                  <td>120k</td>
                </tr>
                <tr valign="top">
                  <td>North African</td>
                  <td>100k</td>
                </tr>
                <tr valign="top">
                  <td>European</td>
                  <td>110k</td>
                </tr>
                <tr valign="top">
                  <td>Hispanic</td>
                  <td>65k</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <p>This example illustrates that it is plausible to match synthetic sample records with individuals in the population and thus identify these individuals, since a synthesized record can have the same value as a real record on quasi-identifiers. However, such identification is only meaningful if we learn somewhat correct sensitive information about these matched individuals. Learning something new is considered when evaluating identifiability risks in practical settings [<xref ref-type="bibr" rid="ref51">51</xref>] and is part of the definition of identity disclosure [<xref ref-type="bibr" rid="ref52">52</xref>]. Learning something new is also similar to the concept of attribution risk as it has been operationalized in the data synthesis literature [<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref47">47</xref>].</p>
        </sec>
      </sec>
      <sec>
        <title>Counting Matches</title>
        <p>To formulate our model, we first need to match a synthetic sample record with a real sample record. Consider the synthetic sample in <xref ref-type="table" rid="table3">Table 3</xref> with a single quasi-identifier, one’s origin; we want to match the record with the “Hispanic” value with the real sample in <xref ref-type="table" rid="table2">Table 2</xref>. We find that there are 2 matching records in the real sample. Without any further information, we would select one of the real sample records at random, and therefore, the probability of selecting any of the records is one-half. However, there is no correct selection here. For example, we cannot say that the third record in the real sample is the correct record match, and therefore the probability of a correct match is one-half; there is no 1:1 mapping between the fully synthetic sample records and the real sample records.</p>
        <p>The key information here is that there was a match—it is a binary indicator. If there is a match between real sample record <italic>s</italic> and a synthetic sample record, we can use the indicator <italic>I<sub>s</sub></italic> (which takes on a value of 1 if there is at least one match, and 0 otherwise).</p>
      </sec>
      <sec>
        <title>Direction of Match</title>
        <p>A concept that is well understood in the disclosure control literature is that the probability of a successful match between someone in the population and a real record will depend on the direction of the match [<xref ref-type="bibr" rid="ref53">53</xref>]. A randomly selected person from the real sample will always have an equivalent record in the population. However, a randomly selected record in the population may not match someone in the real sample due to sampling. The former is referred to as a sample-to-population match, and the latter as a population-to-sample match.</p>
        <p>In our hypothetical example, suppose that the only European in the population is a person named Hans. An adversary may know Hans and can match him with the European record in the synthetic sample through the real sample. Alternatively, the adversary may select the European record in the synthetic sample and match it, through the real sample, with the only European in a population registry, who happens to be Hans. Both directions of attack are plausible, and which one is used will depend on whether the adversary already knows Hans as an acquaintance.</p>
        <p>Now we can combine the 2 types of matching to get an overall match rate between the synthetic record and the population: the synthetic sample–to–real sample match and the real sample–to–population match, and in the other direction. We will formalize this further below.</p>
      </sec>
      <sec>
        <title>Measuring Identification Risk</title>
        <p>We start off by assessing the probability that an adversary can identify a record in the real sample by matching it with an individual in the population. The population-to-sample attack is denoted by <italic>A</italic> and the sample-to-population attack by <italic>B</italic>.</p>
        <p>Under the assumption that an adversary will only attempt one of them, but without knowing which one, the overall probability of one of these attacks being successful is given by the maximum of both [<xref ref-type="bibr" rid="ref49">49</xref>]:</p>
        <p><bold>max(<italic>A,B</italic>)</bold> (1)</p>
        <p>The match rate for population-to-sample attacks is given by El Emam [<xref ref-type="bibr" rid="ref49">49</xref>] (using the notation in <xref ref-type="table" rid="table4">Table 4</xref>):</p>
        <graphic xlink:href="jmir_v22i11e23139_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>This models an adversary who selects a random individual from the population and matches them with records in the real sample. A selected individual from the population may not be in the real sample, and therefore, the sampling does have a protective effect.</p>
        <p>Under the sample-to-population attack, the adversary randomly selects a record from the real sample and matches it to individuals in the population. The match rate is given by El Emam [<xref ref-type="bibr" rid="ref49">49</xref>]:</p>
        <graphic xlink:href="jmir_v22i11e23139_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>We now extend this by accounting for the matches between the records in the synthetic sample and the records in the real sample. Only those records in the real sample that match with a record in the synthetic sample can then be matched with the population. We define an indicator variable <italic>I<sub>s</sub></italic>, which takes on a value of 1 if real sample record <italic>s</italic> matches a synthetic sample record and 0 otherwise. Therefore, we effectively reduce the real sample to those records which match with at least 1 record in the synthetic sample. The population-to-synthetic sample identification risk can thus be expressed as</p>
        <graphic xlink:href="jmir_v22i11e23139_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>And similarly, the synthetic sample-to-population identification risk can be expressed as</p>
        <graphic xlink:href="jmir_v22i11e23139_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>And then we have the overall identification risk from equation (1):</p>
        <graphic xlink:href="jmir_v22i11e23139_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>The population value of 1/<italic>F<sub>s</sub></italic> can be estimated using methods described in various disclosure control texts [<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref54">54</xref>-<xref ref-type="bibr" rid="ref59">59</xref>].</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Notation used in this paper.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="150"/>
            <col width="850"/>
            <thead>
              <tr valign="top">
                <td>Notation</td>
                <td>Interpretation</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>
                  <italic>s</italic>
                </td>
                <td>An index to count records in the real sample</td>
              </tr>
              <tr valign="top">
                <td>
                  <italic>t</italic>
                </td>
                <td>An index to count records in the synthetic sample</td>
              </tr>
              <tr valign="top">
                <td>
                  <italic>N</italic>
                </td>
                <td>The number of records in the true population</td>
              </tr>
              <tr valign="top">
                <td>
                  <italic>f<sub>s</sub></italic>
                </td>
                <td>The equivalence class size in the real sample for a particular record <italic>s</italic> in the real sample. The equivalence class is defined as the set of records with the same values on the quasi-identifiers.</td>
              </tr>
              <tr valign="top">
                <td>
                  <italic>F<sub>s</sub></italic>
                </td>
                <td>The equivalence class size in the population for the equivalence class that has the same quasi-identifier values as record <italic>s</italic> in the real sample. The equivalence class is defined as the set of records with the same values on the quasi-identifiers.</td>
              </tr>
              <tr valign="top">
                <td>
                  <italic>n</italic>
                </td>
                <td>The number of records in the (real or synthetic) sample</td>
              </tr>
              <tr valign="top">
                <td>
                  <italic>I<sub>s</sub></italic>
                </td>
                <td>A binary indicator of whether record <italic>s</italic> in the real sample matches a record in the synthetic sample</td>
              </tr>
              <tr valign="top">
                <td>
                  <italic>R<sub>s</sub></italic>
                </td>
                <td>A binary indicator of whether the adversary would learn something new if record <italic>s</italic> in the real sample matches a record in the synthetic sample</td>
              </tr>
              <tr valign="top">
                <td>
                  <italic>k</italic>
                </td>
                <td>Number of quasi-identifiers</td>
              </tr>
              <tr valign="top">
                <td>
                  <italic>λ</italic>
                </td>
                <td>Adjustment to account for errors in matching and a verification rate that is not perfect</td>
              </tr>
              <tr valign="top">
                <td>
                  <italic>L</italic>
                </td>
                <td>The minimal percentage of sensitive variables that need to be similar between the real sample and synthetic sample to consider that an adversary has learned something new</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Adjusting for Incorrect Matches</title>
        <p>In practice, 2 adjustments should be made to equation (6) to take into account the reality of matching when attempting to identify records [<xref ref-type="bibr" rid="ref60">60</xref>]: data errors and the likelihood of verification. The overall probability can be expressed as:</p>
        <p><italic>pr(a)pr(b</italic>&#124;<italic>a)pr(c</italic>&#124;<italic>a,b)</italic></p>
        <p><italic>pr(a)</italic> is the probability that there are no errors in the data, <italic>pr(b</italic>&#124;<italic>a)</italic> is the probability of a match given that there are no errors in the data, and <italic>pr(c</italic>&#124;<italic>a,b)</italic> is the probability that the match can be verified given that there are no errors in the data and that the records match.</p>
        <p>Real data has errors in it, and therefore, the accuracy of the matching based on adversary knowledge will be reduced [<xref ref-type="bibr" rid="ref53">53</xref>,<xref ref-type="bibr" rid="ref61">61</xref>]. Known data error rates not specific to health data (eg, voter registration databases, surveys, and data from data brokers) can be relatively large [<xref ref-type="bibr" rid="ref62">62</xref>-<xref ref-type="bibr" rid="ref65">65</xref>]. For health data, the error rates have tended to be lower [<xref ref-type="bibr" rid="ref66">66</xref>-<xref ref-type="bibr" rid="ref70">70</xref>], with a weighted mean of 4.26%. Therefore, the probability of at least one variable having an error in it is given by 1–(1–0.0426)<sup>k</sup>, where <italic>k</italic> is the number of quasi-identifiers. If we assume that the adversary has perfect information and only the data will have an error in it, then the probability of no data errors is <italic>pr(a)</italic>=(1–0.0426)<sup>k</sup>.</p>
        <p>A previous review of identification attempts found that when there is a suspected match between a record and a real individual, the suspected match could only be verified 23% of the time [<xref ref-type="bibr" rid="ref71">71</xref>], <italic>pr(c</italic>&#124;<italic>a,b)</italic>=0.23. This means that a large proportion of suspected matches turn out to be false positives when the adversary attempts to verify them. A good example from a published re-identification attack illustrating this is when the adversary was unable to contact the individuals to verify the matches in the time allotted for the study [<xref ref-type="bibr" rid="ref11">11</xref>] (there are potentially multiple reasons for this, such as people moved, died, or their contact information was incorrect), which was 23%. It means that even though there is a suspected match, verifying it is not certain, and without verification, it would not be known whether the match was correct. In some of these studies, the verification ability is confounded with other factors, and therefore, there is uncertainty around this 23% value.</p>
        <p>We can now adjust equation (6) with the λ parameter:</p>
        <p>
          <bold>λ=0.23×(1–0.0426)<sup>k</sup> (8)</bold>
        </p>
        <p>However, equation (8) does not account for the uncertainty in the values obtained from the literature and assumes that verification rates and error rates are independent. Specifically, when there are data errors, they would make the ability to verify less likely, which makes these 2 effects correlated. We can model this correlation, as explained below.</p>
        <p>The verification rate and data error rate can be represented as triangular distributions, which is a common way to model phenomena for risk assessment where the real distribution is not precisely known [<xref ref-type="bibr" rid="ref72">72</xref>]. The means of the distributions are the values noted above, and the minimum and maximum values for each of the triangular distributions were taken from the literature (cited above).</p>
        <p>We can also model the correlation between the 2 distributions to capture the dependency between (lack of) data errors and verification. This correlation was assumed to be medium, according to Cohen guidelines for the interpretation of effect sizes [<xref ref-type="bibr" rid="ref73">73</xref>]. We can then sample from these 2 triangular distributions inducing a medium correlation [<xref ref-type="bibr" rid="ref74">74</xref>]. The 2 sampled values can be entered into equation (8) instead of the mean values, and we get a new value, λ<sub>s</sub>, based on the sampled values. We draw from the correlated triangular distributions for every record in the real sample.</p>
        <p>We can use the λ<sub>s</sub> value directly in our model. However, to err on the conservative side and avoid this adjustment for data errors and verification over-attenuating the actual risk, we use instead the midpoint between λ<sub>s</sub> and the maximum value of 1. We define</p>
        <graphic xlink:href="jmir_v22i11e23139_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>This more conservative adjustment can be entered into equation (6) as follows:</p>
        <graphic xlink:href="jmir_v22i11e23139_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </sec>
      <sec>
        <title>Learning Something New</title>
        <p>We now extend the risk model in equation (10) to determine if the adversary would learn something new from a match. We let <italic>R<sub>s</sub></italic> be a binary indicator of whether the adversary could learn something new:</p>
        <graphic xlink:href="jmir_v22i11e23139_fig10.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>Because a real sample record can match multiple synthetic sample records, the <italic>R<sub>s</sub></italic> is equal to 1 if any of the matches meets the “learning something new” threshold.</p>
        <p>In practice, we compute <italic>I<sub>s</sub></italic> first, and if that is 0, then there is no point in computing the remaining terms for that <italic>s</italic> record: we only consider those records that have a match between the real and synthetic samples since the “learning something new” test would not be applicable where there is no match.</p>
        <p>Learning something new in the context of synthetic data can be expressed as a function of the sensitive variables. Also note that for our analysis, we assume that each sensitive variable is at the same level of granularity as in the real sample since that is the information that the adversary will have after a match.</p>
        <p>The test of whether an adversary learns something new is defined in terms of 2 criteria: (1) Is the individual’s real information different from other individuals in the real sample (ie, to what extent is that individual an outlier in the real sample)? And (2) to what extent is the synthetic sample value similar to the real sample value? Both of these conditions would be tested for every sensitive variable.</p>
        <p>Let us suppose that the sensitive variable we are looking at is the cost of a procedure. Consider the following scenarios: If the real information about an individual is very similar to other individuals (eg, the value is the same as the mean), then the information gain from an identification would be low (note that there is still some information gain, but it would be lower than the other scenarios). However, if the information about an individual is quite different, say the cost of the procedure is 3 times higher than the mean, then the information gain could be relatively high because that value is unusual. If the synthetic sample cost is quite similar to the real sample cost, then the information gain is still higher because the adversary would learn more accurate information. However, if the synthetic sample cost is quite different from the real sample cost, then very little would be learned by the adversary, or what will be learned will be incorrect, and therefore, the correct information gain would be low.</p>
        <p>This set of scenarios is summarized in <xref rid="figure2" ref-type="fig">Figure 2</xref>. Only 1 quadrant (top right) would then represent a high and correct information gain, and the objective of our analysis is to determine whether a matched individual is in that quadrant for at least <italic>L</italic>% of its sensitive variables. A reasonable value of <italic>L</italic> would need to be specified for a particular analysis.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>The relationship between a real observation to the rest of the data in the real sample and to the synthetic observation, which can be used to determine the likelihood of meaningful identity disclosure.</p>
          </caption>
          <graphic xlink:href="jmir_v22i11e23139_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>We propose a model to assess what the adversary would learn from each sensitive variable. If the adversary learns something new for at least <italic>L%</italic> of the sensitive variables, then we set <italic>R<sub>s</sub></italic>=1; otherwise, it is 0.</p>
      </sec>
      <sec>
        <title>Nominal and Binary Sensitive Variables</title>
        <p>We start off with nominal/binary sensitive variables and then extend the model to continuous variables. Let <italic>X<sub>s</sub></italic> be the sensitive variable for real record <italic>s</italic> under consideration, and let <italic>J</italic> be the set of different values that <italic>X<sub>s</sub></italic> can take in the real sample. Assume the matching record has value <italic>X<sub>s</sub>=j</italic> where <italic>j∈J</italic>, and that <italic>p</italic><sub>j</sub> is the proportion of records in the real sample that have the same <italic>j</italic> value.</p>
        <p>We can then determine the distance that the <italic>X<sub>s</sub></italic> value has from the rest of the real sample data as follows:</p>
        <p>
          <bold><italic>d<sub>j</sub>=1–p<sub>j</sub></italic> (12)</bold>
        </p>
        <p>The distance is low if the value <italic>j</italic> is very common, and it is large if the value of <italic>j</italic> is very different than the rest of the real sample dataset.</p>
        <p>Let the matching record on the sensitive variable in the synthetic record be denoted by <italic>Y<sub>t</sub>=z</italic>, where <italic>z∈Z</italic> and <italic>Z</italic> is the set of possible values that <italic>Y</italic><sub>t</sub> can take in the synthetic sample; in practice, <italic>Z⊆J</italic>. For any 2 records that match from the real sample and the synthetic sample, we compare their values. The measure of how similar the real value is to the rest of the distribution when it matches is therefore given by d<sub>j</sub>×[X<sub>s</sub>=Y<sub>t</sub>], where the square brackets are Iverson brackets.</p>
        <p>How do we know if that value indicates that the adversary learns something new about the patient?</p>
        <p>We set a conservative threshold; if the similarity is larger than 1 standard deviation, assuming that taking on value <italic>j</italic> follows a Bernoulli distribution, we then have the inequality for nominal and binary variables that must be met to declare that an adversary will learn something new from a matched sensitive variable.</p>
        <graphic xlink:href="jmir_v22i11e23139_fig11.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>The inequality compares the weighted value with the standard deviation of the proportion <italic>p<sub>j</sub></italic>.</p>
      </sec>
      <sec>
        <title>Continuous Sensitive Variables</title>
        <p>Continuous sensitive variables should be discretized using univariate k-means clustering, with optimal cluster sizes chosen by the majority rule [<xref ref-type="bibr" rid="ref75">75</xref>]. Again, let <italic>X</italic> be the sensitive variable under consideration, and <italic>X<sub>s</sub></italic> be the value of that variable for the real record under consideration. We define the cluster's size in the real sample with the value of the sensitive variable that belongs to the matched real record under consideration as <italic>C<sub>s</sub></italic>. For example, if the sensitive variable is the cost of a procedure and it is $150, and if that specific value is in a cluster of size 5, then <italic>C<sub>s</sub>=5</italic>. The proportion of all patients that are in this cluster compared to all patients in the real sample is given by <italic>p<sub>s</sub></italic>.</p>
        <p>In the same manner as for nominal and binary variables, the distance is defined as</p>
        <p>
          <bold><italic>d<sub>s</sub>=1–p<sub>s</sub></italic> (14)</bold>
        </p>
        <p>Let <italic>Y<sub>t</sub></italic> be the synthetic value on the continuous sensitive variable that matched with real record <italic>s</italic>. The weighted absolute difference expresses how much information the adversary has learned, d<sub>s</sub>×&#124;<italic>X<sub>s</sub>-Y<sub>t</sub></italic>&#124;.</p>
        <p>We need to determine if this value signifies learning too much. We compare this value to the median absolute deviation (MAD) over the <italic>X</italic> variable. The MAD is a robust measure of variation. We define the inequality:</p>
        <p>
          <bold>d<sub>s</sub>×&#124;<italic>X<sub>s</sub>–Y<sub>t</sub></italic>&#124;&#60;1.48×<italic>MAD</italic> (15)</bold>
        </p>
        <p>When this inequality is met, then the weighted difference between the real and synthetic values on the sensitive variable for a particular patient indicates that the adversary will indeed learn something new.</p>
        <p>The 1.48 value makes the MAD equivalent to 1 standard deviation for Gaussian distributions. Of course, the multiplier for MAD can be adjusted since the choice of a single standard deviation equivalent was a subjective (albeit conservative) decision.</p>
      </sec>
      <sec>
        <title>Comprehensive Evaluation of Attacks</title>
        <p>An adversary may not attempt to identify records on their original values but instead generalize the values in the synthetic sample and match those. The adversary may also attempt to identify records on a subset of the quasi-identifiers. Therefore, it is necessary to evaluate generalized values on the quasi-identifiers and subsets of quasi-identifiers during the matching process.</p>
        <p>In <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, we describe how we perform a comprehensive search for these attack modalities by considering all generalizations and all subsets, and then we take the highest risk across all combinations of generalization and quasi-identifier subsets as the overall meaningful identity disclosure risk of the dataset.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <p>We describe the methods used to apply this meaningful identity disclosure risk assessment model on 2 datasets.</p>
      <sec>
        <title>Datasets Evaluated</title>
        <p>We apply the meaningful identity disclosure measurement methodology on 2 datasets. The first is the Washington State Inpatient Database (SID) for 2007. This is a dataset covering population hospital discharges for the year. The dataset has 206 variables and 644,902 observations. The second is the Canadian COVID-19 case dataset with 7 variables and 100,220 records gathered by Esri Canada [<xref ref-type="bibr" rid="ref76">76</xref>].</p>
        <p>We selected a 10% random sample from the full SID and synthesized it (64,490 patients). Then, meaningful identity disclosure of that subset was evaluated using the methodology described in this paper. The whole population dataset was used to compute the population parameters in equation (5) required for calculating the identity disclosure risk values according to equation (11). This ensured that there were no sources of estimation error that needed to be accounted for.</p>
        <p>The COVID-19 dataset has 7 variables, with the date of reporting, health region, province, age group, gender, case status (active, recovered, deceased, and unknown), and type of exposure. A 20% sample was taken from the COVID-19 dataset (20,045 records), and the population was used to compute the meaningful identity disclosure risk similar to the Washington SID dataset.</p>
      </sec>
      <sec>
        <title>Quasi-identifiers</title>
        <p>State inpatient databases have been attacked in the past, and therefore, we know the quasi-identifiers that have been useful to an adversary. One attack was performed on the Washington SID [<xref ref-type="bibr" rid="ref11">11</xref>], and a subsequent one on the Maine and Vermont datasets [<xref ref-type="bibr" rid="ref10">10</xref>]. The quasi-identifiers that were used in these attacks and that are included in the Washington SID are shown in <xref ref-type="table" rid="table5">Table 5</xref>.</p>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Quasi-identifiers included in the analysis of the Washington State Inpatient Database (SID) dataset.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="210"/>
            <col width="790"/>
            <thead>
              <tr valign="top">
                <td>Variable</td>
                <td>Definition</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>AGE</td>
                <td>patient's age in years at the time of admission</td>
              </tr>
              <tr valign="top">
                <td>AGEDAY</td>
                <td>age in days of a patient under 1 year of age</td>
              </tr>
              <tr valign="top">
                <td>AGEMONTH</td>
                <td>age in months for patients under 11 years of age</td>
              </tr>
              <tr valign="top">
                <td>PSTCO2</td>
                <td>patient's state/county federal information processing standard (FIPS) code</td>
              </tr>
              <tr valign="top">
                <td>ZIP</td>
                <td>patient's zip code</td>
              </tr>
              <tr valign="top">
                <td>FEMALE</td>
                <td>sex of the patient</td>
              </tr>
              <tr valign="top">
                <td>AYEAR</td>
                <td>hospital admission year</td>
              </tr>
              <tr valign="top">
                <td>AMONTH</td>
                <td>admission month</td>
              </tr>
              <tr valign="top">
                <td>AWEEKEND</td>
                <td>admission date was on a weekend</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <p>For the COVID-19 dataset, all of the variables, except exposure, would be considered quasi-identifiers since they would be knowable about an individual.</p>
      </sec>
      <sec>
        <title>Data Synthesis Method</title>
        <p>For data synthesis, we used classification and regression trees [<xref ref-type="bibr" rid="ref77">77</xref>], which have been proposed for sequential data synthesis [<xref ref-type="bibr" rid="ref78">78</xref>] using a scheme similar to sequential imputation [<xref ref-type="bibr" rid="ref79">79</xref>,<xref ref-type="bibr" rid="ref80">80</xref>]. Trees are used quite extensively for the synthesis of health and social sciences data [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref81">81</xref>-<xref ref-type="bibr" rid="ref88">88</xref>]. With these types of models, a variable is synthesized by using the values earlier in the sequence as predictors.</p>
        <p>The specific method we used to generate synthetic data is called conditional trees [<xref ref-type="bibr" rid="ref89">89</xref>], although other tree algorithms could also be used. A summary of the algorithm is provided in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref>. When a fitted model is used to generate data, we sample from the predicted terminal node in the tree to get the synthetic values.</p>
        <boxed-text id="box1" position="float">
          <title>Description of the sequential synthesis algorithm.</title>
          <p>Let us say that we have 5 variables, A, B, C, D, and E. The generation is performed sequentially, and therefore, we need to have a sequence. Various criteria can be used to choose a sequence. For our example, we define the sequence as A→E→C→B→D.</p>
          <p>Let the prime notation indicate that the variable is synthesized. For example, A’ means that this is the synthesized version of A. The following are the steps for sequential generation:</p>
          <list list-type="bullet">
            <list-item>
              <p>Sample from the A distribution to get A’</p>
            </list-item>
            <list-item>
              <p>Build a model F1: E ∼ A</p>
            </list-item>
            <list-item>
              <p>Synthesize E as E’ = F1(A’)</p>
            </list-item>
            <list-item>
              <p>Build a model F2: C ∼ A + E</p>
            </list-item>
            <list-item>
              <p>Synthesize C as C’ = F2(A’, E’)</p>
            </list-item>
            <list-item>
              <p>Build a model F3: B ∼ A + E + C</p>
            </list-item>
            <list-item>
              <p>Synthesize B as B’ = F3(A’, E’, C’)</p>
            </list-item>
            <list-item>
              <p>Build a model F4: D ∼ A + E + C + B</p>
            </list-item>
            <list-item>
              <p>Synthesize D as D’ = F4(A’, E’, C’, B’)</p>
            </list-item>
          </list>
          <p>The process can be thought of as having 2 steps, fitting and synthesis. Initially, we are fitting a series of models (F1, F2, F3, F4). These models make up the generator. Then these models can be used to synthesize data according to the scheme illustrated above.</p>
        </boxed-text>
      </sec>
      <sec>
        <title>Risk Assessment Parameters</title>
        <p>As well as computing the meaningful identity disclosure risk for the synthetic sample, we computed the meaningful identity disclosure risk for the real sample itself. With the latter, we let the real sample play the role of the synthetic sample, which means we are comparing the real sample against itself. This should set a baseline to compare the risk values on the synthetic data and allows us to assess the reduction in meaningful identity disclosure risk due to data synthesis. Note that both of the datasets we used in this empirical study were already de-identified to some extent.</p>
        <p>For the computation of meaningful identity disclosure risk, we used an acceptable risk threshold value of 0.09 to be consistent with values that have been proposed by large data custodians and suggested by the European Medicines Agency and Health Canada for the public release of clinical trial data (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). We also set <italic>L</italic>=5%.</p>
      </sec>
      <sec>
        <title>Ethics</title>
        <p>This study was approved by the CHEO Research Institute Research Ethics Board, protocol numbers 20/31X and 20/73X.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>The meaningful identity disclosure risk assessment results according to equation (11) for the Washington hospital discharge data are shown in <xref ref-type="table" rid="table6">Table 6</xref>. We can see that the overall meaningful identity disclosure risk for the synthetic data is significantly lower than the threshold of 0.09. We compare this to the real data, where the overall reduction in risk due to synthesis is approximately 5 times. The synthetic data is 4.5 times below the threshold.</p>
      <p>The risk result on the real dataset is consistent with the empirical attack results [<xref ref-type="bibr" rid="ref11">11</xref>]: An attempt to match 81 individuals resulted in verified, correct matches of 8 individuals, which is a risk level of 0.099 and is more or less the same as the value that was calculated using the current methodology. The real data risk was higher than the threshold, and therefore, by this standard, the original dataset would be considered to have an unacceptably high risk of identifying individuals.</p>
      <p>The results for the synthetic Canadian COVID-19 case data are also below the threshold by about 10 times, and 4 times below risk values for the real data, although the original data has a risk value that is also below the threshold.</p>
      <p>However, it is clear that the synthetic datasets demonstrate a significant reduction in meaningful identity disclosure risk compared to the original real dataset.</p>
      <table-wrap position="float" id="table6">
        <label>Table 6</label>
        <caption>
          <p>Overall meaningful identity disclosure risk results. (The italicized values are the maximum risk values.)</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="200"/>
          <col width="200"/>
          <col width="200"/>
          <col width="200"/>
          <col width="200"/>
          <thead>
            <tr valign="top">
              <td>Parameter</td>
              <td colspan="2">Synthetic data risk</td>
              <td colspan="2">Real data risk</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Population-to-sample risk</td>
              <td>Sample-to-population risk</td>
              <td>Population-to-sample risk</td>
              <td>Sample-to-population risk</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Washington State Inpatient Database</td>
              <td>0.00056</td>
              <td>
                <italic>0.0197</italic>
              </td>
              <td>0.016</td>
              <td>
                <italic>0.098</italic>
              </td>
            </tr>
            <tr valign="top">
              <td>Canadian COVID-19 cases</td>
              <td>0.0043</td>
              <td>
                <italic>0.0086</italic>
              </td>
              <td>0.012</td>
              <td>
                <italic>0.034</italic>
              </td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Summary</title>
        <p>The objective of this study was to develop and empirically test a methodology for the evaluation of identity disclosure risks for fully synthetic health data. This methodology builds on previous work on attribution risk for synthetic data to provide a comprehensive risk evaluation. It was then applied to a synthetic version of the Washington hospital discharge database and the Canadian COVID-19 cases dataset.</p>
        <p>We found that the meaningful identity disclosure risk was below the commonly used risk threshold of 0.09 between 4.5 times and 10 times. Note that this reduced risk level was achieved without implementing any security and privacy controls on the dataset, suggesting that the synthetic variant can be shared with limited controls in place. The synthetic data also had a lower risk than the original data by between 4 and 5 times.</p>
        <p>These results are encouraging in that they provide strong empirical evidence for claims in the literature that the identity disclosure risks from fully synthetic data are low. Further tests and case studies are needed to add more weight to these findings and determine if they are generalizable to other types of datasets.</p>
      </sec>
      <sec>
        <title>Contributions of this Research</title>
        <p>This work extends, in important ways, previous privacy models for fully synthetic data. Let <italic>R’<sub>s</sub></italic> be an arbitrary indicator of whether an adversary learns something new about a real sample record <italic>s</italic>. An earlier privacy risk model [<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref47">47</xref>] focused on attribution risk was defined as:</p>
        <graphic xlink:href="jmir_v22i11e23139_fig12.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>This is similar to our definition of learning something new conditional on identity disclosure. Our model extends this work by also considering the likelihood of matching the real sample record to the population using both directions of attack, including a comprehensive search for possible matches between the real sample and synthetic sample. We also consider data errors and verification probabilities in our model, and our implementation of <italic>R’<sub>s</sub></italic> allows for uncertainty in the matching beyond equality tests.</p>
        <p>Some previous data synthesis studies examined another type of disclosure: membership disclosure [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref48">48</xref>]. The assessment of meaningful identity disclosure, as described in this paper, does not preclude the evaluation of membership disclosure when generating synthetic data, and in fact, both approaches can be considered as complementary ways to examine privacy risks in synthetic data.</p>
        <p>Privacy risk measures that assume that an adversary has white-box or black-box access to the generative model [<xref ref-type="bibr" rid="ref29">29</xref>] are not applicable to our scenario, as our assumption has been that only the synthetic data is shared and the original data custodian retains the generative model.</p>
      </sec>
      <sec>
        <title>Applications in Practice</title>
        <p>Meaningful identity disclosure evaluations should be performed on a regular basis on synthetic data to ensure that the generative models do not overfit. This can complement membership disclosure assessments, providing 2 ways of performing a broad evaluation of privacy risks in synthetic data.</p>
        <p>With our model, it is also possible to include meaningful identity disclosure risk as part of the loss function in generative models to simultaneously optimize on identity disclosure risk as well as data utility, and to manage overfitting during synthesis since a signal of overfitting would be a high meaningful identity disclosure risk.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>The overall risk assessment model is agnostic to the synthesis approach that is used; however, our empirical results are limited to using a sequential decision tree method for data synthesis. While this is a commonly used approach for health and social science data, different approaches may yield different risk values when evaluated using the methodology described here.</p>
        <p>We also made the worst-case assumption that the adversary knowledge is perfect and is not subject to data errors. This is a conservative assumption but was made because we do not have data or evidence on adversary background knowledge errors.</p>
        <p>Future work should extend this model to longitudinal datasets, as the current risk model is limited to cross-sectional data.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Details of calculating and interpreting identity disclosure risk values.</p>
        <media xlink:href="jmir_v22i11e23139_app1.pdf" xlink:title="PDF File (Adobe PDF File), 818 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AIML</term>
          <def>
            <p>artificial intelligence and machine learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">MAD</term>
          <def>
            <p>median absolute deviation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">SID</term>
          <def>
            <p>State Inpatient Database</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>We wish to thank Yangdi Jiang for reviewing an earlier version of this paper.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>This work was performed in collaboration with Replica Analytics Ltd. This company is a spin-off from the Children’s Hospital of Eastern Ontario Research Institute. KEE is co-founder and has equity in this company. LM and JB are data scientists / software engineers employed by Replica Analytics Ltd.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>Government Accountability Office</collab>
          </person-group>
          <article-title>Artificial Intelligence: Emerging opportunities, challenges, and implications for policy and research</article-title>
          <source>U.S. GAO</source>
          <year>2018</year>
          <month>06</month>
          <access-date>2019-07-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.gao.gov/assets/700/692793.pdf">https://www.gao.gov/assets/700/692793.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>McKinsey Global Institute</collab>
          </person-group>
          <article-title>Artificial Intelligence: The next digital frontier?</article-title>
          <source>McKinsey Analytics</source>
          <year>2017</year>
          <month>06</month>
          <access-date>2019-07-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mckinsey.com/~/media/mckinsey/industries/advanced%20electronics/our%20insights/how%20artificial%20intelligence%20can%20deliver%20real%20value%20to%20companies/mgi-artificial-intelligence-discussion-paper.ashx">https://www.mckinsey.com/~/media/mckinsey/industries/advanced%20electronics/our%20insights/how%20artificial%20intelligence%20can%20deliver%20real%20value%20to%20companies/mgi-artificial-intelligence-discussion-paper.ashx</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>Deloitte</collab>
          </person-group>
          <article-title>State of AI in the Enterprise, 2nd Edition</article-title>
          <source>Deloitte Insights</source>
          <year>2018</year>
          <access-date>2019-07-11</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www2.deloitte.com/content/dam/insights/us/articles/4780_State-of-AI-in-the-enterprise/DI_State-of-AI-in-the-enterprise-2nd-ed.pdf">https://www2.deloitte.com/content/dam/insights/us/articles/4780_State-of-AI-in-the-enterprise/DI_State-of-AI-in-the-enterprise-2nd-ed.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lorica</surname>
              <given-names>Ben</given-names>
            </name>
            <name name-style="western">
              <surname>Nathan</surname>
              <given-names>Paco</given-names>
            </name>
          </person-group>
          <source>The State of Machine Learning Adoption in the Enterprise</source>
          <year>2018</year>
          <publisher-loc>Sebastopol, CA</publisher-loc>
          <publisher-name>O'Reilly</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>Government Accountability Office</collab>
            <collab>National Academy of Medicine</collab>
          </person-group>
          <article-title>Artificial Intelligence in Health Care: Benefits and Challenges of Machine Learning in Drug Development (Technology Assessment)</article-title>
          <source>U.S. GAO</source>
          <year>2019</year>
          <month>12</month>
          <access-date>2020-01-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.gao.gov/assets/710/703558.pdf">https://www.gao.gov/assets/710/703558.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>Information Commissioner's Office</collab>
          </person-group>
          <article-title>Anonymisation: Managing Data Protection Risk Code of Practice</article-title>
          <source>ICO</source>
          <year>2012</year>
          <access-date>2020-01-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ico.org.uk/media/1061/anonymisation-code.pdf">https://ico.org.uk/media/1061/anonymisation-code.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>de Montjoye</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Hidalgo</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Verleysen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Blondel</surname>
              <given-names>VD</given-names>
            </name>
          </person-group>
          <article-title>Unique in the Crowd: The privacy bounds of human mobility</article-title>
          <source>Sci Rep</source>
          <year>2013</year>
          <month>03</month>
          <volume>3</volume>
          <fpage>1376</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.doi.org/10.1038/srep01376"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/srep01376</pub-id>
          <pub-id pub-id-type="medline">23524645</pub-id>
          <pub-id pub-id-type="pii">srep01376</pub-id>
          <pub-id pub-id-type="pmcid">PMC3607247</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>de Montjoye</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Radaelli</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>VK</given-names>
            </name>
            <name name-style="western">
              <surname>Pentland</surname>
              <given-names>AS</given-names>
            </name>
          </person-group>
          <article-title>Identity and privacy. Unique in the shopping mall: on the reidentifiability of credit card metadata</article-title>
          <source>Science</source>
          <year>2015</year>
          <month>01</month>
          <day>30</day>
          <volume>347</volume>
          <issue>6221</issue>
          <fpage>536</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1126/science.1256297</pub-id>
          <pub-id pub-id-type="medline">25635097</pub-id>
          <pub-id pub-id-type="pii">347/6221/536</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sweeney</surname>
              <given-names>Latanya</given-names>
            </name>
            <name name-style="western">
              <surname>Yoo</surname>
              <given-names>Ji Su</given-names>
            </name>
            <name name-style="western">
              <surname>Perovich</surname>
              <given-names>Laura</given-names>
            </name>
            <name name-style="western">
              <surname>Boronow</surname>
              <given-names>Katherine E</given-names>
            </name>
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>Phil</given-names>
            </name>
            <name name-style="western">
              <surname>Brody</surname>
              <given-names>Julia Green</given-names>
            </name>
          </person-group>
          <article-title>Re-identification Risks in HIPAA Safe Harbor Data: A study of data from one environmental health study</article-title>
          <source>Technol Sci</source>
          <year>2017</year>
          <fpage>2017082801</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/30687852"/>
          </comment>
          <pub-id pub-id-type="medline">30687852</pub-id>
          <pub-id pub-id-type="pmcid">PMC6344041</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yoo</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Thaler</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sweeney</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Risks to Patient Privacy: A Re-identification of Patients in Maine and Vermont Statewide Hospital Data</article-title>
          <source>Technology Science</source>
          <year>2018</year>
          <month>10</month>
          <day>08</day>
          <fpage>2018100901</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://techscience.org/a/2018100901/"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sweeney</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Matching Known Patients to Health Records in Washington State Data</article-title>
          <source>SSRN Journal</source>
          <year>2015</year>
          <month>07</month>
          <day>05</day>
          <fpage>1</fpage>
          <lpage>13</lpage>
          <pub-id pub-id-type="doi">10.2139/ssrn.2289850</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sweeney</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>von Loewenfeldt</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Perry</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Saying it’s Anonymous Doesn’t Make It So: Re-identifications of “anonymized” law school data</article-title>
          <source>Technology Science</source>
          <year>2018</year>
          <month>11</month>
          <day>12</day>
          <fpage>2018111301</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://techscience.org/a/2018111301/"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zewe</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Imperiled information: Students find website data leaks pose greater risks than most people realize</article-title>
          <source>Harvard John A. Paulson School of Engineering and Applied Sciences</source>
          <year>2020</year>
          <month>01</month>
          <day>17</day>
          <access-date>2020-03-23</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.seas.harvard.edu/news/2020/01/imperiled-information">https://www.seas.harvard.edu/news/2020/01/imperiled-information</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bode</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Researchers Find "Anonymized" Data Is Even Less Anonymous Than We Thought</article-title>
          <source>Motherboard: Tech by Vice</source>
          <year>2020</year>
          <month>02</month>
          <day>03</day>
          <access-date>2020-05-11</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.vice.com/en_ca/article/dygy8k/researchers-find-anonymized-data-is-even-less-anonymous-than-we-thought">https://www.vice.com/en_ca/article/dygy8k/researchers-find-anonymized-data-is-even-less-anonymous-than-we-thought</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Clemons</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Online Profiling and Invasion of Privacy: The Myth of Anonymization</article-title>
          <source>HuffPost</source>
          <year>2013</year>
          <month>02</month>
          <day>20</day>
          <access-date>2020-05-11</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.huffpost.com/entry/internet-targeted-ads_b_2712586">https://www.huffpost.com/entry/internet-targeted-ads_b_2712586</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jee</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>You're very easy to track down, even when your data has been anonymized</article-title>
          <source>MIT Technology Review</source>
          <year>2019</year>
          <month>07</month>
          <day>23</day>
          <access-date>2020-05-11</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.technologyreview.com/2019/07/23/134090/youre-very-easy-to-track-down-even-when-your-data-has-been-anonymized/">https://www.technologyreview.com/2019/07/23/134090/youre-very-easy-to-track-down-even-when-your-data-has-been-anonymized/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kolata</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Your Data Were "Anonymized"? These Scientists Can Still Identify You</article-title>
          <source>The New York Times</source>
          <year>2019</year>
          <month>07</month>
          <day>23</day>
          <access-date>2020-05-05</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.nytimes.com/2019/07/23/health/data-privacy-protection.html">https://www.nytimes.com/2019/07/23/health/data-privacy-protection.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lomas</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Researchers spotlight the lie of "anonymous" data</article-title>
          <source>TechCrunch</source>
          <year>2019</year>
          <month>07</month>
          <day>24</day>
          <access-date>2020-05-11</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://techcrunch.com/2019/07/24/researchers-spotlight-the-lie-of-anonymous-data/">https://techcrunch.com/2019/07/24/researchers-spotlight-the-lie-of-anonymous-data/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mitchell</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Study finds HIPAA protected data still at risks</article-title>
          <source>Harvard Gazette</source>
          <year>2019</year>
          <month>03</month>
          <day>08</day>
          <access-date>2020-05-11</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://news.harvard.edu/gazette/story/newsplus/study-finds-hipaa-protected-data-still-at-risks/">https://news.harvard.edu/gazette/story/newsplus/study-finds-hipaa-protected-data-still-at-risks/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thompson</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Warzel</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Twelve Million Phones, One Dataset, Zero Privacy</article-title>
          <source>The New York Times</source>
          <year>2019</year>
          <month>12</month>
          <day>19</day>
          <access-date>2020-05-11</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.nytimes.com/interactive/2019/12/19/opinion/location-tracking-cell-phone.html">https://www.nytimes.com/interactive/2019/12/19/opinion/location-tracking-cell-phone.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hern</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>'Anonymised' data can never be totally anonymous, says study</article-title>
          <source>The Guardian</source>
          <year>2019</year>
          <month>07</month>
          <day>23</day>
          <access-date>2020-05-05</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.theguardian.com/technology/2019/jul/23/anonymised-data-never-be-anonymous-enough-study-finds">https://www.theguardian.com/technology/2019/jul/23/anonymised-data-never-be-anonymous-enough-study-finds</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ghafur</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Van Dael</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Leis</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Darzi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sheikh</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Public perceptions on data sharing: key insights from the UK and the USA</article-title>
          <source>The Lancet Digital Health</source>
          <year>2020</year>
          <month>09</month>
          <volume>2</volume>
          <issue>9</issue>
          <fpage>e444</fpage>
          <lpage>e446</lpage>
          <pub-id pub-id-type="doi">10.1016/s2589-7500(20)30161-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hoptroff</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>The Synthetic Data Paradigm for Using and Sharing Data</article-title>
          <source>Cutter Executive Update</source>
          <year>2019</year>
          <month>05</month>
          <day>06</day>
          <access-date>2020-05-06</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cutter.com/article/synthetic-data-paradigm-using-and-sharing-data-503526">https://www.cutter.com/article/synthetic-data-paradigm-using-and-sharing-data-503526</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Mosquera</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Hoptroff</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <source>Practical Synthetic Data Generation: Balancing Privacy and the Broad Availability of Data</source>
          <year>2020</year>
          <month>05</month>
          <publisher-loc>Sebastopol, CA</publisher-loc>
          <publisher-name>O'Reilly Media, Inc</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Using CART to generate partially synthetic, public use microdata</article-title>
          <source>Journal of Official Statistics</source>
          <year>2005</year>
          <volume>21</volume>
          <issue>3</issue>
          <fpage>441</fpage>
          <lpage>462</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.scb.se/contentassets/ca21efb41fee47d293bbee5bf7be7fb3/using-cart-to-generate-partially-synthetic-public-use-microdata.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wan</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Variational autoencoder based synthetic data generation for imbalanced learning</article-title>
          <year>2017</year>
          <conf-name>IEEE Symposium Series on Computational Intelligence (SSCI)</conf-name>
          <conf-date>November 27 - December 1</conf-date>
          <conf-loc>Honolulu, Hawaii</conf-loc>
          <pub-id pub-id-type="doi">10.1109/ssci.2017.8285168</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gootjes-Dreesbach</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Sood</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sahay</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hofmann-Apitius</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Variational Autoencoder Modular Bayesian Networks (VAMBN) for Simulation of Heterogeneous Clinical Study Data</article-title>
          <source>bioRxiv</source>
          <year>2019</year>
          <access-date>2020-01-06</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.biorxiv.org/content/biorxiv/early/2019/09/08/760744.full.pdf">https://www.biorxiv.org/content/biorxiv/early/2019/09/08/760744.full.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Mesa</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Malin</surname>
              <given-names>BA</given-names>
            </name>
          </person-group>
          <article-title>Ensuring electronic medical record simulation through better training, modeling, and evaluation</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2019</year>
          <fpage>99</fpage>
          <lpage>108</lpage>
          <pub-id pub-id-type="doi">10.1093/jamia/ocz161</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Park</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Mohammadi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gorde</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Jajodia</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Data synthesis based on generative adversarial networks</article-title>
          <source>Proc. VLDB Endow</source>
          <year>2018</year>
          <month>06</month>
          <day>01</day>
          <volume>11</volume>
          <issue>10</issue>
          <fpage>1071</fpage>
          <lpage>1083</lpage>
          <pub-id pub-id-type="doi">10.14778/3231751.3231757</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chin-Cheong</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Sutter</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Vogt</surname>
              <given-names>JE</given-names>
            </name>
          </person-group>
          <article-title>Generation of Heterogeneous Synthetic Electronic Health Records using GANs</article-title>
          <year>2019</year>
          <conf-name>Workshop on Machine Learning for Health (ML4H) at the 33rd Conference on Neural Information Processing Systems (NeurIPS)</conf-name>
          <conf-date>December 13</conf-date>
          <conf-loc>Vancouver</conf-loc>
          <pub-id pub-id-type="doi">10.3929/ethz-b-000392473</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Biswal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Malin</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Duke</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Stewart</surname>
              <given-names>WF</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Generating Multi-label Discrete Patient Records using Generative Adversarial Networks</article-title>
          <source>arXiv</source>
          <year>2017</year>
          <access-date>2020-05-11</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1703.06490">http://arxiv.org/abs/1703.06490</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Alvarez</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>A critical appraisal of the Article 29 Working Party Opinion 05/2014 on data anonymization techniques</article-title>
          <source>International Data Privacy Law</source>
          <year>2014</year>
          <month>12</month>
          <day>13</day>
          <volume>5</volume>
          <issue>1</issue>
          <fpage>73</fpage>
          <lpage>87</lpage>
          <pub-id pub-id-type="doi">10.1093/idpl/ipu033</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Drechsler</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>Accounting for Intruder Uncertainty Due to Sampling When Estimating Identification Disclosure Risks in Partially Synthetic Data</article-title>
          <source>Privacy in Statistical Databases. Lecture Notes in Computer Science, vol 5262</source>
          <year>2008</year>
          <publisher-loc>Berlin</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>227</fpage>
          <lpage>238</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Drechsler</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>An empirical evaluation of easily implemented, nonparametric methods for generating synthetic datasets</article-title>
          <source>Computational Statistics &#38; Data Analysis</source>
          <year>2011</year>
          <month>12</month>
          <volume>55</volume>
          <issue>12</issue>
          <fpage>3232</fpage>
          <lpage>3243</lpage>
          <pub-id pub-id-type="doi">10.1016/j.csda.2011.06.006</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Mitra</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Estimating Risks of Identification Disclosure in Partially Synthetic Data</article-title>
          <source>JPC</source>
          <year>2009</year>
          <month>04</month>
          <day>01</day>
          <volume>1</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>1</lpage>
          <pub-id pub-id-type="doi">10.29012/jpc.v1i1.567</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dandekar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Zen</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bressan</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>A comparative study of synthetic dataset generation techniques (TRA6/18)</article-title>
          <source>National University of Singapore, School of Computing</source>
          <year>2018</year>
          <access-date>2020-07-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.comp.nus.edu.sg/bitstream/handle/1900.100/7050/TRA6-18.pdf?sequence=1&#38;isAllowed=y">https://dl.comp.nus.edu.sg/bitstream/handle/1900.100/7050/TRA6-18.pdf?sequence=1&#38;isAllowed=y</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Loong</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Zaslavsky</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Harrington</surname>
              <given-names>DP</given-names>
            </name>
          </person-group>
          <article-title>Disclosure control using partially synthetic data for large-scale health surveys, with applications to CanCORS</article-title>
          <source>Statist. Med</source>
          <year>2013</year>
          <month>05</month>
          <day>13</day>
          <volume>32</volume>
          <issue>24</issue>
          <fpage>4139</fpage>
          <lpage>4161</lpage>
          <pub-id pub-id-type="doi">10.1002/sim.5841</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Drechsler</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>Disclosure Risk and Data Utility for Partially Synthetic Data: An Empirical Study Using the German IAB Establishment Survey</article-title>
          <source>Journal of Official Statistics</source>
          <year>2008</year>
          <volume>25</volume>
          <issue>4</issue>
          <fpage>589</fpage>
          <lpage>603</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.scb.se/contentassets/ca21efb41fee47d293bbee5bf7be7fb3/disclosure-risk-and-data-utility-for-partially-synthetic-data-an-empirical-study-using-the-german-iab-establishment-survey.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Drechsler</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bender</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rässler</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Comparing Fully and Partially Synthetic Datasets for Statistical Disclosure Control in the German IAB Establishment Panel</article-title>
          <source>Trans. Data Privacy</source>
          <year>2008</year>
          <volume>1</volume>
          <issue>3</issue>
          <fpage>105</fpage>
          <lpage>130</lpage>
          <pub-id pub-id-type="doi">10.1007/978-1-4614-0326-5_7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>New Approaches to Data Dissemination: A Glimpse into the Future (?)</article-title>
          <source>CHANCE</source>
          <year>2012</year>
          <month>09</month>
          <day>20</day>
          <volume>17</volume>
          <issue>3</issue>
          <fpage>11</fpage>
          <lpage>15</lpage>
          <pub-id pub-id-type="doi">10.1080/09332480.2004.10554907</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Bayesian Estimation of Attribute and Identification Disclosure Risks in Synthetic Data</article-title>
          <source>arXiv</source>
          <year>2018</year>
          <access-date>2020-03-15</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1804.02784">http://arxiv.org/abs/1804.02784</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Taub</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Elliot</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pampaka</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Differential Correct Attribution Probability for Synthetic Data: An Exploration</article-title>
          <source>Privacy in Statistical Databases. Lecture Notes in Computer Science, vol 11126</source>
          <year>2018</year>
          <publisher-loc>Cham</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>122</fpage>
          <lpage>137</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Q</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Domingo-Ferrer</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Disclosure Risk Evaluation for Fully Synthetic Categorical Data</article-title>
          <source>Privacy in Statistical Databases. Lecture Notes in Computer Science, vol 8744</source>
          <year>2014</year>
          <publisher-loc>Cham</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>185</fpage>
          <lpage>199</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>Releasing synthetic magnitude microdata constrained to fixed marginal totals</article-title>
          <source>SJI</source>
          <year>2016</year>
          <month>02</month>
          <day>27</day>
          <volume>32</volume>
          <issue>1</issue>
          <fpage>93</fpage>
          <lpage>108</lpage>
          <pub-id pub-id-type="doi">10.3233/sji-160959</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ruiz</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Muralidhar</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Domingo-Ferrer</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Domingo-Ferrer</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Montes</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>On the Privacy Guarantees of Synthetic Data: A Reassessment from the Maximum-Knowledge Attacker Perspective</article-title>
          <source>Privacy in Statistical Databases. Lecture Notes in Computer Science, vol 11126</source>
          <year>2018</year>
          <publisher-loc>Cham</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>59</fpage>
          <lpage>74</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>Releasing multiply imputed, synthetic public use microdata: an illustration and empirical study</article-title>
          <source>J Royal Statistical Soc A</source>
          <year>2005</year>
          <month>01</month>
          <volume>168</volume>
          <issue>1</issue>
          <fpage>185</fpage>
          <lpage>205</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1467-985x.2004.00343.x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Elliot</surname>
              <given-names>Mark</given-names>
            </name>
          </person-group>
          <article-title>Final Report on the Disclosure Risk Associated with the Synthetic Data produced by the SYLLS Team</article-title>
          <source>Manchester University</source>
          <year>2014</year>
          <month>10</month>
          <access-date>2020-05-11</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://hummedia.manchester.ac.uk/institutes/cmist/archive-publications/reports/2015-02%20-Report%20on%20disclosure%20risk%20analysis%20of%20synthpop%20synthetic%20versions%20of%20LCF_%20final.pdf">https://hummedia.manchester.ac.uk/institutes/cmist/archive-publications/reports/2015-02%20-Report%20on%20disclosure%20risk%20analysis%20of%20synthpop%20synthetic%20versions%20of%20LCF_%20final.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Nyemba</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Malin</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Generating Electronic Health Records with Multiple Data Types and Constraints</article-title>
          <source>arXiv</source>
          <year>2020</year>
          <month>03</month>
          <access-date>2020-06-15</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2003.07904">http://arxiv.org/abs/2003.07904</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <source>Guide to the De-Identification of Personal Health Information</source>
          <year>2013</year>
          <publisher-loc>Boca Raton</publisher-loc>
          <publisher-name>CRC Press (Auerbach)</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Benitez</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Malin</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Evaluating re-identification risks with respect to the HIPAA privacy rule</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2010</year>
          <volume>17</volume>
          <issue>2</issue>
          <fpage>169</fpage>
          <lpage>177</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/20190059"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/jamia.2009.000026</pub-id>
          <pub-id pub-id-type="medline">20190059</pub-id>
          <pub-id pub-id-type="pii">17/2/169</pub-id>
          <pub-id pub-id-type="pmcid">PMC3000773</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wilkinson</surname>
              <given-names>Krista</given-names>
            </name>
            <name name-style="western">
              <surname>Green</surname>
              <given-names>Christopher</given-names>
            </name>
            <name name-style="western">
              <surname>Nowicki</surname>
              <given-names>Deborah</given-names>
            </name>
            <name name-style="western">
              <surname>Von Schindler</surname>
              <given-names>Christina</given-names>
            </name>
          </person-group>
          <article-title>Less than five is less than ideal: replacing the "less than 5 cell size" rule with a risk-based data disclosure protocol in a public health setting</article-title>
          <source>Can J Public Health</source>
          <year>2020</year>
          <month>10</month>
          <volume>111</volume>
          <issue>5</issue>
          <fpage>761</fpage>
          <lpage>765</lpage>
          <pub-id pub-id-type="doi">10.17269/s41997-020-00303-8</pub-id>
          <pub-id pub-id-type="medline">32162281</pub-id>
          <pub-id pub-id-type="pii">10.17269/s41997-020-00303-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC7501321</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Skinner</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>On identification disclosure and prediction disclosure for microdata</article-title>
          <source>Statistica Neerland</source>
          <year>1992</year>
          <month>03</month>
          <volume>46</volume>
          <issue>1</issue>
          <fpage>21</fpage>
          <lpage>32</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1467-9574.1992.tb01324.x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Elliot</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Dale</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Scenarios of Attack: The Data Intruder's Perspective on Statistical Disclosure Risk</article-title>
          <source>Netherlands Official Statistics</source>
          <year>1999</year>
          <volume>14</volume>
          <fpage>6</fpage>
          <lpage>10</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hundepool</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Domingo-Ferrer</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Franconi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Giessing</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schulte Nordholt</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Spicer</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>De Wolf</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <source>Statistical Disclosure Control</source>
          <year>2012</year>
          <publisher-loc>Chichester</publisher-loc>
          <publisher-name>Wiley</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hundepool</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Domingo-Ferrer</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Franconi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Giessing</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lenz</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Naylor</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Schulte Nordholt</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Seri</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>de Wolf</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Handbook on Statistical Disclosure Control</article-title>
          <source>ESSNet</source>
          <year>2010</year>
          <access-date>2020-01-11</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ec.europa.eu/eurostat/cros/system/files/SDC_Handbook.pdf">https://ec.europa.eu/eurostat/cros/system/files/SDC_Handbook.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Duncan</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Elliot</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Salazar</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <source>Statistical Confidentiality - Principles and Practice</source>
          <year>2011</year>
          <publisher-loc>New York</publisher-loc>
          <publisher-name>Springer-Verlag</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Templ</surname>
              <given-names>Matthias</given-names>
            </name>
          </person-group>
          <source>Statistical Disclosure Control for Microdata</source>
          <year>2017</year>
          <publisher-loc>Cham</publisher-loc>
          <publisher-name>Springer International Publishing</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref58">
        <label>58</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Willenborg</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>de Waal</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <source>Statistical Disclosure Control in Practice</source>
          <year>1996</year>
          <publisher-loc>New York</publisher-loc>
          <publisher-name>Springer-Verlag</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref59">
        <label>59</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Willenborg</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>de Waal</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <source>Elements of Statistical Disclosure Control</source>
          <year>2001</year>
          <publisher-loc>New York</publisher-loc>
          <publisher-name>Springer-Verlag</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref60">
        <label>60</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Marsh</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Skinner</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Arber</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Penhale</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Openshaw</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hobcraft</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lievesley</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Walford</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>The Case for Samples of Anonymized Records from the 1991 Census</article-title>
          <source>Journal of the Royal Statistical Society. Series A (Statistics in Society)</source>
          <year>1991</year>
          <volume>154</volume>
          <issue>2</issue>
          <fpage>305</fpage>
          <pub-id pub-id-type="doi">10.2307/2983043</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref61">
        <label>61</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Blien</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Wirth</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Muller</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Disclosure risk for microdata stemming from official statistics</article-title>
          <source>Statistica Neerland</source>
          <year>1992</year>
          <month>03</month>
          <volume>46</volume>
          <issue>1</issue>
          <fpage>69</fpage>
          <lpage>82</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1467-9574.1992.tb01327.x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref62">
        <label>62</label>
        <nlm-citation citation-type="web">
          <article-title>Inaccurate, Costly, and Inefficient: Evidence That America's Voter Registration System Needs an Upgrade</article-title>
          <source>The Pew Charitable Trusts</source>
          <year>2012</year>
          <access-date>2020-12-15</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://pew.org/2yHGTSf">http://pew.org/2yHGTSf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref63">
        <label>63</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rainie</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Kiesler</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Madden</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Anonymity, Privacy, and Security Online</article-title>
          <year>2013</year>
          <access-date>2019-12-03</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.pewresearch.org/internet/2013/09/05/anonymity-privacy-and-security-online/">https://www.pewresearch.org/internet/2013/09/05/anonymity-privacy-and-security-online/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref64">
        <label>64</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Leetaru</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>The Data Brokers So Powerful Even Facebook Bought Their Data - But They Got Me Wildly Wrong</article-title>
          <source>Forbes</source>
          <year>2018</year>
          <month>04</month>
          <day>05</day>
          <access-date>2019-12-03</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.forbes.com/sites/kalevleetaru/2018/04/05/the-data-brokers-so-powerful-even-facebook-bought-their-data-but-they-got-me-wildly-wrong/">https://www.forbes.com/sites/kalevleetaru/2018/04/05/the-data-brokers-so-powerful-even-facebook-bought-their-data-but-they-got-me-wildly-wrong/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref65">
        <label>65</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Venkatadri</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Sapiezynski</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Redmiles</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Mislove</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Goga</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Mazurek</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gummadi</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Auditing Offline Data Brokers via Facebook’s Advertising Platform</article-title>
          <year>2019</year>
          <conf-name>The World Wide Web Conference</conf-name>
          <conf-date>May 13-17</conf-date>
          <conf-loc>San Francisco</conf-loc>
          <pub-id pub-id-type="doi">10.1145/3308558.3313666</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref66">
        <label>66</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goldberg</surname>
              <given-names>Saveli I</given-names>
            </name>
            <name name-style="western">
              <surname>Niemierko</surname>
              <given-names>Andrzej</given-names>
            </name>
            <name name-style="western">
              <surname>Turchin</surname>
              <given-names>Alexander</given-names>
            </name>
          </person-group>
          <article-title>Analysis of data errors in clinical research databases</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2008</year>
          <month>11</month>
          <day>06</day>
          <fpage>242</fpage>
          <lpage>6</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/18998889"/>
          </comment>
          <pub-id pub-id-type="medline">18998889</pub-id>
          <pub-id pub-id-type="pmcid">PMC2656002</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref67">
        <label>67</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hong</surname>
              <given-names>MKH</given-names>
            </name>
            <name name-style="western">
              <surname>Yao</surname>
              <given-names>HHI</given-names>
            </name>
            <name name-style="western">
              <surname>Pedersen</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Peters</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Costello</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>DG</given-names>
            </name>
            <name name-style="western">
              <surname>Hovens</surname>
              <given-names>CM</given-names>
            </name>
            <name name-style="western">
              <surname>Corcoran</surname>
              <given-names>NM</given-names>
            </name>
          </person-group>
          <article-title>Error rates in a clinical data repository: lessons from the transition to electronic data transfer—a descriptive study</article-title>
          <source>BMJ Open</source>
          <year>2013</year>
          <month>05</month>
          <day>17</day>
          <volume>3</volume>
          <issue>5</issue>
          <fpage>e002406</fpage>
          <pub-id pub-id-type="doi">10.1136/bmjopen-2012-002406</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref68">
        <label>68</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mitchel</surname>
              <given-names>JT</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>YJ</given-names>
            </name>
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Cappi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Horn</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kist</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>D'Agostino</surname>
              <given-names>RB</given-names>
            </name>
          </person-group>
          <article-title>Evaluation of Data Entry Errors and Data Changes to an Electronic Data Capture Clinical Trial Database</article-title>
          <source>Drug Information Journal</source>
          <year>2011</year>
          <month>07</month>
          <volume>45</volume>
          <issue>4</issue>
          <fpage>421</fpage>
          <lpage>430</lpage>
          <pub-id pub-id-type="doi">10.1177/009286151104500404</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref69">
        <label>69</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wahi</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Parks</surname>
              <given-names>DV</given-names>
            </name>
            <name name-style="western">
              <surname>Skeate</surname>
              <given-names>RC</given-names>
            </name>
            <name name-style="western">
              <surname>Goldin</surname>
              <given-names>SB</given-names>
            </name>
          </person-group>
          <article-title>Reducing Errors from the Electronic Transcription of Data Collected on Paper Forms: A Research Data Case Study</article-title>
          <source>Journal of the American Medical Informatics Association</source>
          <year>2008</year>
          <month>05</month>
          <day>01</day>
          <volume>15</volume>
          <issue>3</issue>
          <fpage>386</fpage>
          <lpage>389</lpage>
          <pub-id pub-id-type="doi">10.1197/jamia.m2381</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref70">
        <label>70</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nahm</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Pieper</surname>
              <given-names>CF</given-names>
            </name>
            <name name-style="western">
              <surname>Cunningham</surname>
              <given-names>MM</given-names>
            </name>
          </person-group>
          <article-title>Quantifying Data Quality for Clinical Trials Using Electronic Data Capture</article-title>
          <source>PLoS ONE</source>
          <year>2008</year>
          <month>8</month>
          <day>25</day>
          <volume>3</volume>
          <issue>8</issue>
          <fpage>e3049</fpage>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0003049</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref71">
        <label>71</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Branson</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Good</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Monge</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Probst</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Evaluating the re-identification risk of a clinical study report anonymized under EMA Policy 0070 and Health Canada Regulations</article-title>
          <source>Trials</source>
          <year>2020</year>
          <month>2</month>
          <day>18</day>
          <volume>21</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>1</lpage>
          <pub-id pub-id-type="doi">10.1186/s13063-020-4120-y</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref72">
        <label>72</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vose</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <source>Risk Analysis: A Quantitative Guide, 3rd ed</source>
          <year>2008</year>
          <publisher-loc>Chichester</publisher-loc>
          <publisher-name>Wiley</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref73">
        <label>73</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <source>Statistical power analysis for the behavioral sciences</source>
          <year>1988</year>
          <publisher-loc>Hillsdale, NJ</publisher-loc>
          <publisher-name>Lawrence Erlbaum Associates</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref74">
        <label>74</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Iman</surname>
              <given-names>RL</given-names>
            </name>
            <name name-style="western">
              <surname>Conover</surname>
              <given-names>WJ</given-names>
            </name>
          </person-group>
          <article-title>A distribution-free approach to inducing rank correlation among input variables</article-title>
          <source>Communications in Statistics - Simulation and Computation</source>
          <year>2007</year>
          <month>06</month>
          <day>27</day>
          <volume>11</volume>
          <issue>3</issue>
          <fpage>311</fpage>
          <lpage>334</lpage>
          <pub-id pub-id-type="doi">10.1080/03610918208812265</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref75">
        <label>75</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Charrad</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ghazzali</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Boiteau</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Niknafs</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>NbClust: An R Package for Determining the Relevant Number of Clusters in a Data Set</article-title>
          <source>J. Stat. Soft</source>
          <year>2014</year>
          <volume>61</volume>
          <issue>6</issue>
          <fpage>1</fpage>
          <lpage>1</lpage>
          <pub-id pub-id-type="doi">10.18637/jss.v061.i06</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref76">
        <label>76</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>Esri Canada</collab>
          </person-group>
          <article-title>Covid-19 Resources</article-title>
          <source>Covid-19 Canada</source>
          <access-date>2020-10-15</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://resources-covid19canada.hub.arcgis.com/">https://resources-covid19canada.hub.arcgis.com/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref77">
        <label>77</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gordon</surname>
              <given-names>AD</given-names>
            </name>
            <name name-style="western">
              <surname>Breiman</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Friedman</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Olshen</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Stone</surname>
              <given-names>CJ</given-names>
            </name>
          </person-group>
          <article-title>Classification and Regression Trees</article-title>
          <source>Biometrics</source>
          <year>1984</year>
          <month>09</month>
          <volume>40</volume>
          <issue>3</issue>
          <fpage>874</fpage>
          <pub-id pub-id-type="doi">10.2307/2530946</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref78">
        <label>78</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Using CART to generate partially synthetic, public use microdata</article-title>
          <source>Journal of Official Statistics</source>
          <year>2005</year>
          <volume>21</volume>
          <issue>3</issue>
          <fpage>441</fpage>
          <lpage>462</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref79">
        <label>79</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Conversano</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Siciliano</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Incremental Tree-Based Missing Data Imputation with Lexicographic Ordering</article-title>
          <source>J Classif</source>
          <year>2010</year>
          <month>1</month>
          <day>7</day>
          <volume>26</volume>
          <issue>3</issue>
          <fpage>361</fpage>
          <lpage>379</lpage>
          <pub-id pub-id-type="doi">10.1007/s00357-009-9038-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref80">
        <label>80</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Conversano</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Siciliano</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Tree based classifiers for conditional incremental missing data imputation</article-title>
          <source>Department of Mathematics and Statistics, University of Naples</source>
          <year>2002</year>
          <access-date>2020-05-11</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://erin.it.jyu.fi/dataclean/abstracts/node25.html">http://erin.it.jyu.fi/dataclean/abstracts/node25.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref81">
        <label>81</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Arslan</surname>
              <given-names>RC</given-names>
            </name>
            <name name-style="western">
              <surname>Schilling</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>Gerlach</surname>
              <given-names>TM</given-names>
            </name>
            <name name-style="western">
              <surname>Penke</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Using 26,000 diary entries to show ovulatory changes in sexual desire and behavior</article-title>
          <source>J Pers Soc Psychol</source>
          <year>2018</year>
          <month>08</month>
          <day>27</day>
          <fpage>1</fpage>
          <lpage>48</lpage>
          <pub-id pub-id-type="doi">10.1037/pspp0000208</pub-id>
          <pub-id pub-id-type="medline">30148371</pub-id>
          <pub-id pub-id-type="pii">2018-41799-001</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref82">
        <label>82</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bonnéry</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Henneberger</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>TL</given-names>
            </name>
            <name name-style="western">
              <surname>Lachowicz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rose</surname>
              <given-names>BA</given-names>
            </name>
            <name name-style="western">
              <surname>Shaw</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Stapleton</surname>
              <given-names>LM</given-names>
            </name>
            <name name-style="western">
              <surname>Woolley</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>The Promise and Limitations of Synthetic Data as a Strategy to Expand Access to State-Level Multi-Agency Longitudinal Data</article-title>
          <source>Journal of Research on Educational Effectiveness</source>
          <year>2019</year>
          <month>08</month>
          <day>02</day>
          <volume>12</volume>
          <issue>4</issue>
          <fpage>616</fpage>
          <lpage>647</lpage>
          <pub-id pub-id-type="doi">10.1080/19345747.2019.1631421</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref83">
        <label>83</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sabay</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Harris</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Bejugama</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Jaceldo-Siegl</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Overcoming Small Data Limitations in Heart Disease Prediction by Using Surrogate Data</article-title>
          <source>SMU Data Science Review</source>
          <year>2018</year>
          <volume>1</volume>
          <issue>3</issue>
          <fpage>12</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://scholar.smu.edu/datasciencereview/vol1/iss3/12"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref84">
        <label>84</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Freiman</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lauger</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Data Synthesis and Perturbation for the American Community Survey at the U.S. Census Bureau</article-title>
          <source>US Census Bureau</source>
          <year>2017</year>
          <access-date>2020-05-05</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.census.gov/content/dam/Census/library/working-papers/2018/adrm/2017%20Data%20Synthesis%20and%20Perturbation%20for%20ACS.pdf">https://www.census.gov/content/dam/Census/library/working-papers/2018/adrm/2017%20Data%20Synthesis%20and%20Perturbation%20for%20ACS.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref85">
        <label>85</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nowok</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Utility of synthetic microdata generated using tree-based methods</article-title>
          <source>Administrative Data Research Centre, University of Edinburgh</source>
          <year>2015</year>
          <access-date>2020-05-11</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.unece.org/fileadmin/DAM/stats/documents/ece/ces/ge.46/20150/Paper_33_Session_2_-_Univ._Edinburgh__Nowok_.pdf">https://www.unece.org/fileadmin/DAM/stats/documents/ece/ces/ge.46/20150/Paper_33_Session_2_-_Univ._Edinburgh__Nowok_.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref86">
        <label>86</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Raab</surname>
              <given-names>GM</given-names>
            </name>
            <name name-style="western">
              <surname>Nowok</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Dibben</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Practical Data Synthesis for Large Samples</article-title>
          <source>JPC</source>
          <year>2018</year>
          <month>02</month>
          <day>02</day>
          <volume>7</volume>
          <issue>3</issue>
          <fpage>67</fpage>
          <lpage>97</lpage>
          <pub-id pub-id-type="doi">10.29012/jpc.v7i3.407</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref87">
        <label>87</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nowok</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Raab</surname>
              <given-names>GM</given-names>
            </name>
            <name name-style="western">
              <surname>Dibben</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Providing bespoke synthetic data for the UK Longitudinal Studies and other sensitive data with the synthpop package for R</article-title>
          <source>SJI</source>
          <year>2017</year>
          <month>08</month>
          <day>21</day>
          <volume>33</volume>
          <issue>3</issue>
          <fpage>785</fpage>
          <lpage>796</lpage>
          <pub-id pub-id-type="doi">10.3233/sji-150153</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref88">
        <label>88</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Quintana</surname>
              <given-names>Daniel S</given-names>
            </name>
          </person-group>
          <article-title>A synthetic dataset primer for the biobehavioural sciences to promote reproducibility and hypothesis generation</article-title>
          <source>eLife</source>
          <year>2020</year>
          <month>03</month>
          <day>11</day>
          <volume>9</volume>
          <fpage>e53275</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.7554/eLife.53275"/>
          </comment>
          <pub-id pub-id-type="doi">10.7554/eLife.53275</pub-id>
          <pub-id pub-id-type="medline">32159513</pub-id>
          <pub-id pub-id-type="pii">53275</pub-id>
          <pub-id pub-id-type="pmcid">PMC7112950</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref89">
        <label>89</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hothorn</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Hornik</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zeileis</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Unbiased Recursive Partitioning: A Conditional Inference Framework</article-title>
          <source>Journal of Computational and Graphical Statistics</source>
          <year>2006</year>
          <month>09</month>
          <volume>15</volume>
          <issue>3</issue>
          <fpage>651</fpage>
          <lpage>674</lpage>
          <pub-id pub-id-type="doi">10.1198/106186006X133933</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
