<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v22i11e18735</article-id>
      <article-id pub-id-type="pmid">33141090</article-id>
      <article-id pub-id-type="doi">10.2196/18735</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Balancing Accuracy and Privacy in Federated Queries of Clinical Data Repositories: Algorithm Development and Validation</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Visweswaran</surname>
            <given-names>Shyam</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Hochheiser</surname>
            <given-names>Harry</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Farzi</surname>
            <given-names>Jebraeil</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Mourby</surname>
            <given-names>Miranda</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Yu</surname>
            <given-names>Yun William</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8275-9576</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Weber</surname>
            <given-names>Griffin M</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <address>
            <institution>Department of Biomedical Informatics</institution>
            <institution>Harvard Medical School</institution>
            <addr-line>10 Shattuck St</addr-line>
            <addr-line>Boston, MA, 02115</addr-line>
            <country>United States</country>
            <phone>1 617 432 6134</phone>
            <email>weber@hms.harvard.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2597-881X</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Computer &amp; Mathematical Sciences</institution>
        <institution>University of Toronto</institution>
        <addr-line>Toronto, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Biomedical Informatics</institution>
        <institution>Harvard Medical School</institution>
        <addr-line>Boston, MA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Griffin M Weber <email>weber@hms.harvard.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>11</month>
        <year>2020</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>3</day>
        <month>11</month>
        <year>2020</year>
      </pub-date>
      <volume>22</volume>
      <issue>11</issue>
      <elocation-id>e18735</elocation-id>
      <history>
        <date date-type="received">
          <day>15</day>
          <month>3</month>
          <year>2020</year>
        </date>
        <date date-type="rev-request">
          <day>4</day>
          <month>7</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>28</day>
          <month>8</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>7</day>
          <month>9</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©Yun William Yu, Griffin M Weber. Originally published in the Journal of Medical Internet Research (http://www.jmir.org), 03.11.2020.</copyright-statement>
      <copyright-year>2020</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on http://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2020/11/e18735" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Over the past decade, the emergence of several large federated clinical data networks has enabled researchers to access data on millions of patients at dozens of health care organizations. Typically, queries are broadcast to each of the sites in the network, which then return aggregate counts of the number of matching patients. However, because patients can receive care from multiple sites in the network, simply adding the numbers frequently double counts patients. Various methods such as the use of trusted third parties or secure multiparty computation have been proposed to <italic>link</italic> patient records across sites. However, they either have large trade-offs in accuracy and privacy or are not scalable to large networks.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to enable accurate estimates of the number of patients matching a federated query while providing strong guarantees on the amount of protected medical information revealed.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We introduce a novel probabilistic approach to running federated network queries. It combines an algorithm called HyperLogLog with obfuscation in the form of hashing, masking, and homomorphic encryption. It is <italic>tunable</italic>, in that it allows networks to balance accuracy versus privacy, and it is computationally efficient even for large networks. We built a user-friendly free open-source benchmarking platform to simulate federated queries in large hospital networks. Using this platform, we compare the accuracy, <italic>k</italic>-anonymity privacy risk (with <italic>k</italic>=10), and computational runtime of our algorithm with several existing techniques.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>In simulated queries matching 1 to 100 million patients in a 100-hospital network, our method was significantly more accurate than adding aggregate counts while maintaining <italic>k</italic>-anonymity. On average, it required a total of 12 kilobytes of data to be sent to the network hub and added only 5 milliseconds to the overall federated query runtime. This was orders of magnitude better than other approaches, which guaranteed the exact answer.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Using our method, it is possible to run highly accurate federated queries of clinical data repositories that both protect patient privacy and scale to large networks.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>algorithms</kwd>
        <kwd>medical records</kwd>
        <kwd>privacy</kwd>
        <kwd>information storage and retrieval</kwd>
        <kwd>medical record linkage</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Widespread adoption of electronic health records has generated vast amounts of data, which are increasingly being used in clinical, epidemiological, and public health research [<xref ref-type="bibr" rid="ref1">1</xref>]. Data from multiple health care organizations are often needed to increase statistical power or to access diverse patient populations and geographic regions. Although it is possible to combine patient-level data from multiple sites into a secure central repository for analysis, there are often significant technical and regulatory barriers to doing this in a way that ensures patient privacy. Institutions must compare the benefit of centralized data for research with the risk of violating the Health Insurance Portability and Accountability Act (HIPAA) and other privacy laws as a result of unintended disclosure of patient data. An alternative approach is to create federated clinical data research networks, which broadcast queries to multiple sites, run analyses locally, and then combine the results. In this way, sites retain control over their patient data. Two of the largest networks in the United States are the Patient-Centered Outcomes Research Network (PCORnet) [<xref ref-type="bibr" rid="ref2">2</xref>] and the National Institutes of Health (NIH)–funded Accrual to Clinical Trials (ACT) network [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref5">5</xref>], both of which connect dozens of health care organizations across the country and include health data on nearly 100 million Americans.</p>
        <p>As patients often receive care at more than one clinical site, the data for a patient at any one site might not be complete, and the same information about a patient might be duplicated at different sites. This can lead to queries returning incorrect results. This problem is amplified when the sites in the network are geographically close and there is greater overlap in their patient populations. However, because patients move or travel, sometimes across state or country borders, even far apart sites might share patients. A similar situation arises when patients’ data are intentionally separated for technical reasons, such as when large amounts of clinical data (eg, diagnoses and medications) and genomic data are stored in different locations, and it is not feasible to merge them into a single database. In both cases, computation must be performed in a distributed fashion, but the challenge is that an individual patient’s data may be spread across multiple databases.</p>
        <p>Various methods of addressing this problem have been described in the literature, but they have different trade-offs in terms of accuracy, privacy, scalability, and computational complexity. We grouped these into 3 broad categories: aggregate counts, hashed patient identifiers, and privacy-guaranteed methods (<xref rid="figure1" ref-type="fig">Figure 1</xref>).</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Federated query methods. We classify methods for merging distributed queries into 3 groups: (top) sharing aggregate counts, (middle) sending full hashed patient identifiers, and (bottom) generating bitstrings (displayed as hexadecimal) that do not directly correspond to individual patients but can be merged together. HLL: HyperLogLog; MPC: multiparty computation; SSN: social security number.</p>
          </caption>
          <graphic xlink:href="jmir_v22i11e18735_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <sec>
          <title>Aggregate Counts</title>
          <p>Federated queries in PCORnet and ACT ask sites to return the number of patients in their local databases who match some set of criteria, such as having both hypertension and diabetes. The networks present the user with the aggregate count from each site, and no attempt is made to link patients across sites or deduplicate records. This can lead to large overestimates of the number of distinct patients who match a query if the counts from each site are naively summed [<xref ref-type="bibr" rid="ref6">6</xref>]. To protect patient privacy, the networks mask small counts by displaying <italic>≤10 patients</italic>. However, it is possible to combine results from multiple queries to reveal information about individual patients (see the <italic>Methods</italic> section for details). Sites participating in these networks are aware of this privacy risk, which they mitigate through institutional agreements that require sites to audit researchers’ queries and monitor their use of the network.</p>
        </sec>
        <sec>
          <title>Hashed Patient Identifiers</title>
          <p>The most accurate and semisecure method to deduplicate the results in a federated query is for each site to return the full list of patients who match the query. Privacy is the main concern, as data on every patient matching the query (potentially many millions of people) must be shared. Patient identifiers (eg, name and date of birth) [<xref ref-type="bibr" rid="ref7">7</xref>] are typically encrypted using a one-way hash function, such as Secure Hash Algorithm 1 (SHA-1) [<xref ref-type="bibr" rid="ref8">8</xref>]. The same patient at two sites will be hashed to the same value if the same hash function is used (and there are no inconsistencies in the underlying demographic data). Unfortunately, hash functions are vulnerable to dictionary or linkage attacks, where an adversary who knows the encryption method can simply generate a rainbow table of the hashes of many possible patient identifiers (eg, exhaustively searching all 9-digit social security numbers or accessing public voter registration lists) and then use this to reidentify the list of hash values returned by a site [<xref ref-type="bibr" rid="ref9">9</xref>].</p>
        </sec>
        <sec>
          <title>Privacy-Guaranteed Methods</title>
          <p>Secure multiparty computation (MPC) and homomorphic encryption techniques enable true privacy guarantees in a federated network (see the <italic>Methods</italic> section) and have recently been introduced for distributed genome-wide association studies [<xref ref-type="bibr" rid="ref10">10</xref>] and pharmacological collaboration [<xref ref-type="bibr" rid="ref11">11</xref>]. The limitation of these algorithms is their computational complexity. Protocols that securely determine the number of shared patients between two sites [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref14">14</xref>] are impractical for large networks because the number of pairwise and multiway comparisons grows exponentially with the number of sites. Other approaches that avoid exponential comparison either require sharing gigabytes of data [<xref ref-type="bibr" rid="ref15">15</xref>], making numerous rounds of back-and-forth communication [<xref ref-type="bibr" rid="ref16">16</xref>], or using trusted third parties [<xref ref-type="bibr" rid="ref17">17</xref>]. These are also problematic because, as we have previously shown [<xref ref-type="bibr" rid="ref18">18</xref>], large federated clinical data networks are fragile, with multiple sites typically failing to respond even to aggregate count queries.</p>
        </sec>
      </sec>
      <sec>
        <title>HyperLogLog Sketch</title>
        <p>In this paper, we propose a new method for combining data from sites in a federated clinical data network, based on the HyperLogLog (HLL) probabilistic sketching algorithm [<xref ref-type="bibr" rid="ref19">19</xref>]. A probabilistic sketch is a small data structure that summarizes large amounts of data. A calculation can run on the sketch to obtain a fast, accurate estimate of what the result would be on the original data. Although HLL is widely used in many software programs, such as internet search engines, to our knowledge, it has not been applied to federated queries of health data.</p>
        <p>The basic idea behind HLL (and other minimum value sketches) [<xref ref-type="bibr" rid="ref20">20</xref>] is that the minimum of a collection of random numbers between 0 and 1 is inversely proportional to how many numbers are present. For example, a single random number between 0 and 1 has an expected value of 0.5; however, if we have 99 random numbers, the minimum has an expected value of 0.01. By using a hash function that maps patients to a random number between 0 and 1, we can estimate the number of patients who match a query at a site by keeping track of just the minimum hash value of the matching patients. If the minimum hash value is <italic>v</italic>, then the estimated number of patients is (1/<italic>v</italic>)-1. Although the accuracy of this estimate is poor, the method can be improved by using <italic>t</italic> different hash functions to generate <italic>t</italic> independent estimates of the number of patients. The average of these results in a more accurate overall estimate. The set of <italic>t</italic> minimum hash values is the sketch.</p>
        <p>If each site in a network uses the same hash function and returns its minimal hash value, then we can estimate the number of distinct patients in the whole network that match the query from the smallest of those values. Although it may seem unintuitive that the network minimum hash is the same as the hash for one hospital, the hospital to which the minimum hash corresponds changes when multiple hash functions are used, allowing the estimator to be accurate.</p>
        <p>Instead of using <italic>t</italic> hash functions, HLL improves the accuracy of this method by using a single hash function but efficiently dividing the patients into <italic>t</italic> partitions and returning the minimum hash value of patients in each partition. HLL also returns the position of the leading one indicator in the binary expansion of the minimum values rather than the actual values. This only has a small effect on accuracy; however, it greatly reduces the risk of reidentification from a dictionary attack. For <italic>t</italic> partitions, the relative error of HLL is approximately 1/sqrt(<italic>t</italic>). For example, by asking sites to share an HLL sketch with only 100 values, the number of distinct patients can be estimated with a 10% relative error. The error can be reduced by increasing <italic>t</italic>. Although higher <italic>t</italic> increases the risk of reidentification, the risk is quantifiable and predictable, enabling networks to define policies that maximize accuracy while reducing risk to an acceptable level.</p>
      </sec>
      <sec>
        <title>Objectives</title>
        <p>We aim to enable accurate estimates of the number of unique patients matching a federated query while providing strong guarantees on the amount of protected medical information revealed.</p>
      </sec>
      <sec>
        <title>Structure of This Paper</title>
        <p>In the <italic>Methods</italic> section, we first show how sites can generate a privacy-preserving HLL sketch of the patients who match a query and how the shared sketches from sites can be combined to estimate the number of unique patients in the network who match the query. We then describe several <italic>obfuscation</italic> approaches that further reduce the privacy risk of aggregate counts, hashed identifiers, and HLL sketches. These include methods that might result in a loss of information or an increase in computational complexity to make it more difficult or impossible for an adversary to identify patients. In the <italic>Results</italic> section, we test our algorithm and other methods using simulated networks of different sizes and degrees of patient overlap. We compare them along several dimensions, including accuracy, privacy risk, computation time, and amount of data shared. Finally, in the <italic>Discussion</italic> section, we summarize the trade-offs and limitations of the algorithms and provide recommendations on when networks should consider using HLL sketches.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Algorithms and Obfuscation Techniques for Federated Queries</title>
        <p>Here, we describe the algorithms we compared. The basic model assumes that a researcher at one hospital in the network sends a query of the form <italic>How many unique patients have condition X across the hospital network?</italic> to a central network hub. The hub then distributes the query to all the hospitals in the network. The hospitals determine which of their patients match the query and return a result (the form of this result varies by algorithm) to the hub. The hub combines the results and returns an estimate of the total number of unique patients to the researcher. The name of each algorithm combines the base method (<italic>Count</italic>, <italic>HashedIDs</italic>, or <italic>HLL</italic>) and any additional obfuscation (<italic>Mask</italic>, <italic>MPC</italic>, <italic>Rehash</italic>, or <italic>Shuffle</italic>).</p>
        <sec>
          <title>Count</title>
          <p>Each hospital runs the researcher’s query locally and sends the hub a single count of the number of matching patients. The hub returns 2 numbers: (1) the maximum count from a hospital and (2) the sum of counts from all hospitals. The maximum count corresponds to a lower bound on the result, because even in the event of significant overlapping patients between hospitals, there are at least as many unique patients across the network as there are at a single hospital. For example, in <xref rid="figure1" ref-type="fig">Figure 1</xref>, hospitals 1, 2, and 3 have 100,000, 80,000, and 50,000 patients, respectively. It might be the case that all patients at hospitals 2 and 3 are also patients at hospital 1, which has the maximum count. However, this is not possible for the hospitals with smaller counts. For example, out of 100,000 patients of hospital 1, at most 80,000 can also be patients at hospital 2. The sum of all counts is obviously an upper bound, although it might be a substantial overestimate when there is a significant overlap between hospitals. Conversely, the maximum of all counts is obviously a lower bound.</p>
        </sec>
        <sec>
          <title>Count+Mask</title>
          <p>The procedure is identical to Count, except that if the actual count of a hospital is between 1 and 9 inclusive, the hospital returns 10 to the hub instead. This masking procedure ensures that no nonzero number corresponds to fewer than 10 patients, ensuring 10-anonymity. Both the PCORnet and ACT networks use Count+Mask. ACT further obfuscates the result by adding a small random number between –10 and +10 to the actual count [<xref ref-type="bibr" rid="ref4">4</xref>]; however, we ignore this in our analyses.</p>
        </sec>
        <sec>
          <title>Count+MPC</title>
          <p>This protocol is based on the ElGamal cryptosystem [<xref ref-type="bibr" rid="ref21">21</xref>] using a distributed private key to ensure that no one party can decrypt intermediate data. Only the final sum is decrypted. The individual hospital counts are hidden, even if all hospitals but one and the hub are compromised. The major disadvantage is that the MPC requires all hospitals to respond before any answer can be given. In large networks, it is likely that some hospitals will either be slow to respond or not respond at all [<xref ref-type="bibr" rid="ref18">18</xref>], which limits this protocol to only small networks in practice (for additional information on our MPC implementation, see <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref24">24</xref>]).</p>
        </sec>
        <sec>
          <title>HashedIDs</title>
          <p>Each hospital runs the query locally, producing a list of matching patient IDs. Each hospital needs to use the same process for constructing IDs so that the same patient at different hospitals will have the same ID. As there is no universal patient identifier, the ID should be based on information likely to be unique to the patient and available at all hospitals, such as the concatenation of the patient’s first name, last name, and date of birth [<xref ref-type="bibr" rid="ref7">7</xref>] (for additional details and limitations of generating a patient ID, see <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref24">24</xref>]). Patient IDs are encrypted using a one-way hash function. For our simulations, we used SHA-1, but in practice, a newer, more secure hash function should be used. The list of hashed IDs is then sent back to the hub. The hub then counts the number of distinct hashed IDs received from all sites and returns this as the exact answer to the query. Sites can precompute the hashed IDs for all of their patients to improve the performance of queries. Note that because <italic>HashedIDs</italic> uses the same hash function for all queries, a dictionary or linkage attack by the hub has a high likelihood of success.</p>
        </sec>
        <sec>
          <title>HashedIDs+Rehash</title>
          <p>This is identical to <italic>HashedIDs</italic>, except that the originating hospital (the hospital with the researcher who ran the query) also sends the hub a random string encrypted with the public keys of each of the other hospitals (using any kind of standard off-the-shelf asymmetric key encryption, as used in protocols such as Rivest-Shamir-Adleman [RSA] and Hypertext Transfer Protocol Secure [HTTPS]). Each hospital rehashes all the patients, prepending the random string before running it through SHA-1. By doing so, because the hub does not know the random prefix string, it cannot perform a dictionary attack to reverse the hash function, and thus, all patients get 10-anonymity. Of course, rehashing all patients with each query requires additional computational time.</p>
        </sec>
        <sec>
          <title>HLL</title>
          <p>A graphical overview of HLL is shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>. Like HashedIDs, in HLL, the hospital uses the SHA-1 hash function to produce a 160-bit pseudorandom number for each patient that matches a query. The first 64 bits are interpreted as an integer <italic>B</italic>, and the patient is put into bucket <italic>B</italic> mod <italic>t</italic>, where <italic>t</italic> is the number of buckets. The hospital then finds the position <italic>V</italic> of the first bit set to 1 in bits 65 to 128 of the SHA-1 string. Within each bucket, the hospital stores the largest value <italic>V</italic> corresponding to a patient. The list of bucket values is the HLL sketch from that hospital. (Note that like <italic>HashedIDs</italic>, hospitals can precompute the buckets <italic>B</italic> and values <italic>V</italic> for all of their patients, so that this step does not have to be repeated for each query.)</p>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>HLL sketches. (a) To create an HLL sketch, we first hash a set of identifiers for the matching patients (eg, social security number) to binary strings. The first several bits of each binary string are used to bucket the values, and then within each bucket, we store the position of the leading one indicator of the minimum value. (b) HLL sketches from different hospitals are merged by simply taking, within each bucket, the maximum value across sketches. (c) Given a list of buckets, we can estimate the cardinality. HLL: HyperLogLog; SSN: social security number.</p>
            </caption>
            <graphic xlink:href="jmir_v22i11e18735_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <p>The hospitals send these HLL sketches to the central hub. The hub combines the sketches by taking the maximum within each bucket across the hospital sketches, generating a sketch of the union. The hub then estimates the cardinality <italic>C</italic> of the union sketch using the standard HLL estimator [<xref ref-type="bibr" rid="ref19">19</xref>]. The hub also provides a 95% CI by using the fact that the relative SD of the estimate is around 1/sqrt(<italic>t</italic>), so <italic>C</italic>×(1±1.96/sqrt(<italic>t</italic>)) gives the lower and upper bounds of a 95% CI.</p>
        </sec>
        <sec>
          <title>HLL+Mask</title>
          <p>As shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>, this algorithm is identical to HLL, except that the hospital precomputes a list of bucket values that are less than 10-anonymous. If after generating the HLL sketch corresponding to the query, a hospital sees that there is a bucket that is not 10-anonymous, the hospital aborts and reverts to the <italic>Count+Mask</italic> algorithm, where only a single (possibly masked) aggregate count is returned. The hub thus receives a combination of sketches and masked counts.</p>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>Applying obfuscation to HLL sketches. (a) HLL+Mask: For each bucket, we count the total number of patients (not just the ones who match the query) whose hashes have the same leading one indicator. If that number is less than 10, then the bucket is not 10-anonymous, so we do not send the HLL sketch. Instead, we only send a masked aggregate count of the number of patients matching the query. (b) HLL+Shuffle: We do a coordinated random shuffling so the central hub does not know what the original buckets were for the leading one indicator. However, the hub can still estimate cardinality in the same way as HLL without obfuscation. HLL: HyperLogLog.</p>
            </caption>
            <graphic xlink:href="jmir_v22i11e18735_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <p>The hub combines the sketches together using the HLL cardinality estimator to obtain an estimate of the count of the union of all the hospitals that sent sketches with appropriate 95% error bounds. From that, the hub goes through something similar to <italic>Count</italic>. The hub returns 2 numbers: the sum of all raw hospital counts plus the 95% CI maximum for the HLL union count, which gives an upper bound, and the maximum of the set of raw counts or the 95% CI minimum for the HLL union, which gives a lower bound.</p>
        </sec>
        <sec>
          <title>HLL+Rehash</title>
          <p>This algorithm uses HLL but with an obfuscation method similar to <italic>HashedID+Rehash</italic>. When the originating hospital sends a query to the hub, it sends both a query and a random string encrypted with public keys of each of the other hospitals in the network. The hospitals completely regenerate the HLL sketch while prepending the random string to the patient IDs before hashing. Although this procedure takes more time, the hub cannot use a dictionary attack at all because it does not know the random string. Thus, all patients are guaranteed 10-anonymity if the random string is not revealed to the hub.</p>
        </sec>
        <sec>
          <title>HLL+Shuffle</title>
          <p>This algorithm also sends a random string encrypted with public keys of each of the other hospitals in the network to the hub. However, it is much faster than <italic>HLL+Rehash</italic> because it avoids having to rehash all patients. Each hospital first creates an ordinary HLL sketch using their precomputed hashed IDs. It then shuffles the ordering of the buckets using the random string to determine the sort order and then sends the shuffled sketch to the hub (<xref rid="figure3" ref-type="fig">Figure 3</xref>).</p>
          <p>As every hospital uses the same permutation, the sketches can still be combined and the normal estimators can be used. However, the hub, without knowing the random string, cannot know which bucket in the original sketch corresponds to a bucket in the shuffled sketch. Normally, an HLL bucket is less than 10-anonymous if that value+bucket pair corresponds to fewer than 10 individuals at the hospital. With shuffling, an HLL bucket is less than 10-anonymous only if that value corresponds to fewer than 10 individuals at the hospital. On average, this decreases the risk by dividing the risk score by the number of buckets. In other words, the buckets partition the patient population into smaller, more identifiable groups. By shuffling the buckets, it is no longer known which partition the value came from, which makes the value less identifiable.</p>
        </sec>
        <sec>
          <title>HLL+MPC</title>
          <p>Like <italic>Count+MPC</italic>, this method is based on the ElGamal homomorphic cryptosystem, and we use the same primitives as in that method (with the same security guarantees). We additionally take inspiration from a previous paper applying MPC to a Flajolet-Martin style approximate counter [<xref ref-type="bibr" rid="ref16">16</xref>]. The key setup, exchange, encryption and decryption routines are identical to those of <italic>Count+MPC</italic> (for additional information on our MPC implementation, see <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref24">24</xref>]).</p>
        </sec>
        <sec>
          <title>HLL+Shuffle+MPC</title>
          <p>This procedure is simply a combination of <italic>HLL+Shuffle</italic> and <italic>HLL+MPC</italic>. Each hospital simply shuffles their buckets according to the random string before performing the encryption. The rest of the procedure is identical to that of <italic>HLL+MPC</italic>.</p>
        </sec>
      </sec>
      <sec>
        <title>Testing and Evaluating the Algorithms</title>
        <p>To quantitatively measure privacy loss, we used an adapted <italic>k</italic>-anonymity model of privacy, whereby the privacy risk is defined to be the number of revealed data points that correspond to fewer than <italic>k</italic>=10 patients [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref25">25</xref>] (for details on the privacy risk score, see <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref24">24</xref>]). We ran benchmarks for runtime, accuracy, and privacy loss on (1) shared aggregate counts (<italic>Count</italic> and <italic>Count+Mask</italic>), (2) shared hashed identifiers (<italic>HashedIDs</italic>), and (3) our proposed HLL approach. Each of these was paired with various obfuscation techniques of masking, rehashing, shuffling, and MPC. HLL was tested using different numbers of buckets or values in the sketch. We indicate the size of the sketch, <italic>t</italic>, with a number after <italic>HLL</italic>, such that <italic>HLLN</italic> means 2<sup>N</sup> values. For example, <italic>t</italic>=2<sup>1</sup>=2 (<italic>HLL1</italic>), <italic>t</italic>=2<sup>4</sup>=16 (<italic>HLL4</italic>), <italic>t</italic>=2<sup>7</sup>=128 (<italic>HLL7</italic>), and <italic>t</italic>=2<sup>15</sup>=32,768 (<italic>HLL15</italic>). Although <italic>Count+MPC</italic> uses a standard MPC privacy-guaranteed cryptosystem, we implemented our own protocols for the HLL+MPC variants using ElGamal encryption [<xref ref-type="bibr" rid="ref21">21</xref>] and a private equality test [<xref ref-type="bibr" rid="ref23">23</xref>]. We did not run benchmarks for other existing privacy-guaranteed methods because they do not scale well and are infeasible for running on large data sets, with either extremely high runtime or error (for descriptions of several of these algorithms and their limitations, see <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref24">24</xref>]).</p>
        <p>Due to patient privacy, we cannot test the algorithms using actual hospital data. Therefore, we developed software for generating simulated federated networks of hospitals spread geographically with highly varying sizes and overlap [<xref ref-type="bibr" rid="ref24">24</xref>] (for details on simulating a federated hospital network, see <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref24">24</xref>]). We ran our benchmarks on simulated networks containing up to 100 million total distinct patients, distributed across 100 hospitals. In the simulations, patients on average received care at 2 hospitals. However, this number varies and hospitals that are geographically close in the simulations are modeled to have a larger number of shared patients.</p>
        <p>The benchmarks were run on an 8-core AMD Ryzen 1700 processor with 16 GB of RAM running Ubuntu 18.04.2 Long Term Support. We measured the wall-clock time for each pipeline component for time complexity and serialized bitstrings in each communication round for transmission space complexity. We provide all code in GitHub [<xref ref-type="bibr" rid="ref26">26</xref>].</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Quantitative Simulation Benchmark Results</title>
        <p><xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> lists the detailed benchmark results for accuracy, privacy risk, and runtimes of queries matching 1, 10, 100, 1000, 10,000, 100,000, 1 million, 10 million, or 100 million patients using the different methods. As an example, <xref ref-type="table" rid="table1">Table 1</xref> shows a subset of rows from the table in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> corresponding only to queries matching 10,000 patients and HLL sketches with 2<sup>7</sup> (HLL7) and 2<sup>15</sup> (HLL15) values.</p>
        <p>Accuracy is described in absolute terms as the 95% CIs of the estimated number of patients who matched a query in 100 simulated experiments. More precisely, in each of the 100 runs, each estimator tries to return either its best guess or upper or lower bounds. If it returns a single best guess, then we report the 97.5th and 2.5th percentiles as the upper and lower bounds, respectively. If it returns upper or lower bounds, then we report the 97.5th percentile of the upper bound and the 2.5th percentile of the lower bound. These are then converted into relative errors by comparing them with the true number of distinct patients.</p>
        <p>Privacy risk is determined by counting the number of statistics (ie, a count, HLL bucket, or hash) that are not 10-anonymous and are revealed to either the hub alone or the hub colluding with a hospital. It relates to the number of patients who are <italic>potentially</italic> identifiable with a specific statistic, but it does not necessarily mean that an adversary will be able to identify a patient from a statistic. Therefore, it can be thought of as an upper bound on direct linkage risk. Note that this guarantee is applicable primarily for one common threat model. In the <italic>Discussion</italic> section, we will cover some other more sophisticated potential avenues for attack.</p>
        <p>Wait time is the additional computational time that hospitals require to generate the statistics plus the time the hub requires to combine each hospital’s results. (It does not include the time each hospital needs to run the query.) For the same query, hospitals might have different wait times based on the number of matching patients. We, therefore, report both <italic>mean wait time</italic>, which is the average hospital computation time+hub computation time, and <italic>max wait time</italic>, which is the maximum hospital computation time for a run+hub computation time.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Benchmark results for selected methods for queries matching 10,000 patients.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="210"/>
            <col width="0"/>
            <col width="170"/>
            <col width="0"/>
            <col width="190"/>
            <col width="0"/>
            <col width="70"/>
            <col width="0"/>
            <col width="70"/>
            <col width="0"/>
            <col width="100"/>
            <col width="0"/>
            <col width="160"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Method and obfuscation</td>
                <td colspan="4">Estimated number of patients</td>
                <td colspan="4">Wait (seconds)</td>
                <td colspan="2">Risk:Hub</td>
                <td colspan="2">Risk:Hub+Site</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">Range of counts</td>
                <td colspan="2">Relative error (%)</td>
                <td colspan="2">Mean</td>
                <td colspan="2">Max</td>
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="14">
                  <bold>Count</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">None</td>
                <td colspan="2">899.9-19,470</td>
                <td colspan="2">–91 to 95</td>
                <td colspan="2">0</td>
                <td colspan="2">0</td>
                <td colspan="2">2.65</td>
                <td>2.65</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Mask</td>
                <td colspan="2">899.9-19,477</td>
                <td colspan="2">–91 to 95</td>
                <td colspan="2">0</td>
                <td colspan="2">0</td>
                <td colspan="2">0</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">MPC<sup>a</sup></td>
                <td colspan="2">18,886-19,470</td>
                <td colspan="2">89 to 95</td>
                <td colspan="2">0.099</td>
                <td colspan="2">0.099</td>
                <td colspan="2">0</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td colspan="14">
                  <bold>HLL7<sup>b</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">None</td>
                <td colspan="2">8310-11,347</td>
                <td colspan="2">–17 to 13</td>
                <td colspan="2">0.006</td>
                <td colspan="2">0.006</td>
                <td colspan="2">15.73</td>
                <td>15.73</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Shuffle</td>
                <td colspan="2">8310-11,347</td>
                <td colspan="2">–17 to 13</td>
                <td colspan="2">0.006</td>
                <td colspan="2">0.006</td>
                <td colspan="2">0.23</td>
                <td>15.73</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Rehash</td>
                <td colspan="2">8310-11,347</td>
                <td colspan="2">–17 to 13</td>
                <td colspan="2">0.007</td>
                <td colspan="2">0.016</td>
                <td colspan="2">0</td>
                <td>15.73</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Mask</td>
                <td colspan="2">7167-14,123</td>
                <td colspan="2">–28 to 41</td>
                <td colspan="2">0.005</td>
                <td colspan="2">0.005</td>
                <td colspan="2">0</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">MPC</td>
                <td colspan="2">8310-11,347</td>
                <td colspan="2">–17 to 13</td>
                <td colspan="2">37.83</td>
                <td colspan="2">37.83</td>
                <td colspan="2">0.3</td>
                <td>0.3</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Shuffle+MPC</td>
                <td colspan="2">8310-11,347</td>
                <td colspan="2">–17 to 13</td>
                <td colspan="2">37.83</td>
                <td colspan="2">37.83</td>
                <td colspan="2">0</td>
                <td>0.3</td>
              </tr>
              <tr valign="top">
                <td colspan="14">
                  <bold>HLL15</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">None</td>
                <td colspan="2">9928-10,075</td>
                <td colspan="2">–1 to 1</td>
                <td colspan="2">1.462</td>
                <td colspan="2">1.462</td>
                <td colspan="2">3707</td>
                <td>3707</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Shuffle</td>
                <td colspan="2">9928-10,075</td>
                <td colspan="2">–1 to 1</td>
                <td colspan="2">1.462</td>
                <td colspan="2">1.462</td>
                <td colspan="2">0.23</td>
                <td>3707</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Rehash</td>
                <td colspan="2">9928-10,075</td>
                <td colspan="2">–1 to 1</td>
                <td colspan="2">1.625</td>
                <td colspan="2">1.668</td>
                <td colspan="2">0</td>
                <td>3707</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Mask</td>
                <td colspan="2">899.9-19,477</td>
                <td colspan="2">–91 to 95</td>
                <td colspan="2">0.012</td>
                <td colspan="2">0.012</td>
                <td colspan="2">0</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td colspan="14">
                  <bold>HashedIDs</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">None</td>
                <td colspan="2">10,000-10,000</td>
                <td colspan="2">0 to 0</td>
                <td colspan="2">0.002</td>
                <td colspan="2">0.002</td>
                <td colspan="2">19,174</td>
                <td>19,174</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Rehash</td>
                <td colspan="2">10,000-10,000</td>
                <td colspan="2">0 to 0</td>
                <td colspan="2">0.002</td>
                <td colspan="2">0.004</td>
                <td colspan="2">0</td>
                <td>19,174</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>MPC: multiparty computation.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>HLL: HyperLogLog.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>As an example, in <xref ref-type="table" rid="table1">Table 1</xref>, for a query that actually matches 10,000 patients, the basic <italic>Count</italic> algorithm had an estimated count CI (using the summation for the upper estimate and maximum for the lower estimate) of 899.9 to 19,470 patients or a relative error of –91% to +95%. It also, on average, had 2.65 hospitals that returned potentially identifiable counts because the value was less than 10. This risk can be eliminated with <italic>Count+Mask</italic>, which increases the error, or by <italic>Count+MPC</italic>, which adds computational complexity, and only gives a single guess, instead of both upper and lower bounds. On the opposite extreme, <italic>HashedIDs</italic> returns the exact answer, but all 10,000 patients’ identities are at risk from a dictionary attack. (Note that <xref ref-type="table" rid="table1">Table 1</xref> lists the risk for <italic>HashedIDs</italic> at 19,174 because the same patient’s hash value can be returned by more than one hospital. We report the number of potentially identifiable values shared, not the number of unique patients at risk.) In <italic>HashedIDs+Rehash</italic>, the hub alone cannot identify patients from the hash values (the <italic>Risk:Hub</italic> column). However, the risk returns if an adversary can also obtain the secret random string from a hospital (the <italic>Risk:Hub+Site</italic> column).</p>
        <p><xref ref-type="table" rid="table1">Table 1</xref> shows that <italic>HLL7</italic> and <italic>HLL15</italic> can achieve a more tunable balance between accuracy and privacy. <italic>HLL7</italic> has a relative error of –17% to +13% (8310 to 11,347), which is considerably better than that of <italic>Count</italic>, and <italic>HLL15</italic> results in an even smaller relative error of –1% to +1% (9928 to 10,075). <italic>HLL7</italic> and <italic>HLL15</italic> generate, on average, 15.73 and 3707 potentially identifiable values, respectively. However, adding obfuscation with <italic>HLL+Shuffle</italic> adds essentially no additional computation time but reduces the risk to less than 1 (0.23 on average) potentially identifiable value. In other words, highly accurate estimates with only 1% error can be obtained with most queries having no risk of reidentification. Even if an adversary obtains the secret random string, the risk of 3707 is much less than 19,174 for <italic>HashedIDs</italic>.</p>
      </sec>
      <sec>
        <title>Graphical Comparison of Algorithms</title>
        <p><xref rid="figure4" ref-type="fig">Figure 4</xref> graphically illustrates the accuracy (the horizontal axis) and risk (the vertical axis) trade-off of the different algorithms. For simplicity, only the upper bound of the relative error is used for accuracy. (The lower bound and absolute errors are not shown.) Although an individual simulation is plotted as a single point in the figure, algorithms are shown as regions because changing the input parameters to the simulation affects the results. For example, the blue region in <xref rid="figure4" ref-type="fig">Figure 4</xref> covers the range of HLLs with queries of different sizes (10 to 10 million matching patients) and sketches of different sizes (HLL1=2 to HLL15=32,768 values).</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Comparison of the query accuracy/privacy risk trade-off based on the simulations of a network with 100 sites and 100 million patients. HashedIDs and Count bound the graph, whereas HLL-based methods enable a more balanced approach. (HLL+MPC is only shown for 10 million patients, and the values for HLL7+MPC and HLL15+MPC are theoretical rather than experimental.) HLL+MPC reduces the HLL risk by 1/s, where s is the number of sites in the network. HLL+Shuffle reduces the HLL risk by 1/t, where t is the number of values in the HLL sketch. HLL: HyperLogLog; MPC: multiparty computation.</p>
          </caption>
          <graphic xlink:href="jmir_v22i11e18735_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>The key takeaway from <xref rid="figure4" ref-type="fig">Figure 4</xref> is that <italic>Count</italic> and <italic>HashedIDs</italic> are extremes that cover only one axis or the other, whereas variations of <italic>HLL</italic> enable networks to select an algorithm that fits anywhere between the axes. In other words, with <italic>HLL</italic>, networks can determine an acceptable risk level and pick the sketch size and obfuscation method that will give the most accurate result. Alternatively, they can start with a desired accuracy and pick the most secure method that runs within a given amount of time.</p>
        <p><italic>Count+Mask</italic> has the worst accuracy but guarantees 10-anonymity (thin horizontal gray box; <xref rid="figure4" ref-type="fig">Figure 4</xref>). As each patient in the simulation was, on average, at two hospitals, queries that matched all 100 million distinct patients returned counts from each hospital that added up to 200 million—a 100% overestimate. Queries that only matched a few patients (<italic>small queries</italic>) had much greater error because of the obfuscation. The worst case, in theory, is when a query matches one distinct patient and that patient happens to be at each of the 100 hospitals. As each hospital returns <italic>≤10</italic>, the upper bound estimate assumes that there are 10 patients in each hospital and that there is no overlap. This would result in an upper bound estimate of 100×10=1000 or a relative error of 99,900%. Even when patients are only at one hospital (no overlap), <italic>Count+Mask</italic> can have a 900% error.</p>
        <p>Without obfuscation, the relative error of <italic>Count</italic> in the simulations remained near 100% for queries of all sizes (thin vertical gray box; <xref rid="figure4" ref-type="fig">Figure 4</xref>). However, for small queries, many sites returned potentially identifiable counts less than 10. At the other extreme, <italic>HashedIDs</italic> always gave correct answers (0% relative error). However, this requires sharing individual data on all matching patients (thin vertical brown box; <xref rid="figure4" ref-type="fig">Figure 4</xref>). The risk can be reduced if a different hash function is used for each query (<italic>HashedIDs+Rehash</italic>) and an adversary is unable to discover the hash functions.</p>
        <p>Variations of <italic>HLL</italic> fill in the space between <italic>Count</italic>, <italic>Count+Mask</italic>, and <italic>HashedIDs</italic>, allowing the networks to tune their estimation method to achieve a more desirable balance of accuracy and risk for a given application. In <xref rid="figure4" ref-type="fig">Figure 4</xref>, <italic>HLL</italic> (the blue region), <italic>HLL+Shuffle</italic> (the red region), and <italic>HLL+Rehash</italic> (the thin horizontal green box) have the same accuracy but different levels of risk. In contrast to <italic>Count</italic>, which has more risk with smaller queries, <italic>HLL</italic>, like <italic>HashedIDs</italic>, has a higher risk with larger queries. Doubling the number of buckets in the <italic>HLL</italic> sketch reduces the error by a factor of sqrt(2); however, without obfuscation, it also doubles the risk.</p>
        <p>The benefit of <italic>HLL+Shuffle</italic> is that buckets can be added to reduce error with only minimal change in risk. For queries that matched fewer than 100,000 patients, even <italic>HLL15+Shuffle</italic>, which has a relative error of only approximately 1%, had an average privacy risk of less than 1. <italic>HLL+Rehash</italic> reduced risk even further but required over a minute of extra computational time in some experiments, whereas the computational time of <italic>HLL+Shuffle</italic> is negligible. <italic>HLL+Mask</italic> guarantees 10-anonymity, but its error was often almost as large as <italic>Count+Mask</italic>. The benefit of <italic>HLL+Mask</italic> is that it can leverage the improved accuracy of HLL when possible, while ensuring that no added risk is introduced.</p>
      </sec>
      <sec>
        <title>Qualitative Comparison of the Algorithms</title>
        <p><xref ref-type="table" rid="table2">Table 2</xref> provides a qualitative summary of the results. In general, HLL, especially with obfuscation, is much more accurate than aggregate counts, lower risk than sharing hash values of all matching patients, and more scalable than privacy-guaranteeing algorithms. The relevant benefits of certain methods depend on the number of patients who match the query. For example, as the number of patients increases, the risk of <italic>Count</italic> decreases, as indicated by “(–)”, while the risk of <italic>HLL7</italic> increases, as indicated by “(+).”</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Qualitative comparison of algorithms.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="260"/>
            <col width="220"/>
            <col width="150"/>
            <col width="170"/>
            <col width="170"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Method and obfuscation</td>
                <td>Approximation error</td>
                <td>Runtime wait</td>
                <td>Risk:Hub</td>
                <td>Risk:Hub+Site</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="6">
                  <bold>Count</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>None</td>
                <td>Large</td>
                <td>Very small</td>
                <td>Medium (–)</td>
                <td>Medium (–)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Mask</td>
                <td>Large</td>
                <td>Very small</td>
                <td>Zero</td>
                <td>Zero</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>MPC<sup>a</sup></td>
                <td>No change<sup>b</sup></td>
                <td>Medium</td>
                <td>Zero</td>
                <td>Zero</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>HLL7<sup>c</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>None</td>
                <td>Medium</td>
                <td>Small</td>
                <td>Medium (+)</td>
                <td>Medium (+)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Shuffle</td>
                <td>No change</td>
                <td>No change</td>
                <td>Small (+)</td>
                <td>No change</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Rehash</td>
                <td>No change</td>
                <td>Medium (+)</td>
                <td>Zero</td>
                <td>No change</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Mask</td>
                <td>Medium (+)</td>
                <td>Medium (–)</td>
                <td>Zero</td>
                <td>Zero</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>MPC</td>
                <td>No change</td>
                <td>Large</td>
                <td>Small (+)</td>
                <td>Small (+)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Shuffle+MPC</td>
                <td>No change</td>
                <td>HLL7+MPC</td>
                <td>Very small (+)</td>
                <td>HLL7+MPC</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>HLL15</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>None</td>
                <td>Small</td>
                <td>Medium</td>
                <td>Large (+)</td>
                <td>Large (+)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Shuffle</td>
                <td>No change</td>
                <td>No change</td>
                <td>Small (+)</td>
                <td>No change</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Rehash</td>
                <td>No change</td>
                <td>Medium (+)</td>
                <td>Zero</td>
                <td>No change</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Mask</td>
                <td>Large (+)</td>
                <td>Medium (–)</td>
                <td>Zero</td>
                <td>Zero</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>HashedIDs</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>None</td>
                <td>Zero</td>
                <td>Medium (+)</td>
                <td>Very large (+)</td>
                <td>Very large (+)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Rehash</td>
                <td>No change</td>
                <td>No change</td>
                <td>Zero</td>
                <td>No change</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>MPC: multiparty computation.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>No change: the value is the same as the method without any obfuscation.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>HLL: HyperLogLog.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Computational and Communication Costs</title>
        <p><xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref> shows the theoretical upper bounds on the computational costs of each method plus obfuscation technique, theoretical exact communication costs (the space complexity of the amount of data that the hospitals and hub have to send over the network), and the actual empirical results of both computational and communication costs.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Summary of Results and Practical Considerations</title>
        <p>In this study, we surveyed and benchmarked a range of methods for determining the number of distinct patients who matched a federated query, exploring the trade-offs in accuracy, privacy, and speed. We explicitly do not endorse a single one-size-fits-all method because different networks and institutions will have different needs. With data use agreements and a trusted third party, <italic>HashedIDs</italic> provides the most accurate results. When minimizing privacy risk is the most important factor, networks can choose between (1) fast but inaccurate methods such as <italic>Count+Mask</italic>, (2) accurate but slow algorithms such as <italic>HLL+Rehash</italic>, or (3) privacy-guaranteed methods that only work on small networks. A key goal of the ACT network is <italic>real-time</italic> queries that enable rapid exploration of the data. As a result, adding even a few seconds of computational time to ACT queries might not be acceptable. When runtimes must be minimized, methods such as <italic>HLL7+Mask</italic> and <italic>HLL7+Shuffle</italic> are fast and have a good balance between accuracy and privacy.</p>
        <p>In practice, we envision a combination approach. Queries can first be run using a fast, private method, such as <italic>Count+Mask</italic> or <italic>Count+MPC</italic>. Given these rough results and the needs of the researcher, hospitals can then be asked to return the HLL sketches for the patients who matched the query. The initial count estimate and the privacy risk allowed by the network could be used to select the HLL sketch size and obfuscation method that would return the most accurate result in a reasonable amount of time. In the final stage of research (eg, in preparation for a full clinical trial), investigators could request permission from institutions to run accurate but potentially identifiable queries, such as <italic>HLL15</italic> or <italic>HashedIDs</italic>.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>It is important for each institution to assess their own risk models. In particular, our risk model assumes that given a sketch for a given condition (eg, hypertension), the adversary already has access to the list of patients at the hospital and wants to identify patients that have the condition. The filled buckets of an HLL sketch correspond to hashes of patients who have the condition, and our goal is to ensure that for every patient with the condition, at least nine other patients without that condition could have hashed to the same value, ensuring 10-anonymity. Statistics that do not meet this requirement count for the privacy loss score. For example, our privacy risk analysis differs considerably from that of Desfontaines et al [<xref ref-type="bibr" rid="ref27">27</xref>] who argue that “cardinality estimators do not preserve privacy.” However, their threat model assumes that an adversary can access the sketches as they are being generated, one patient at a time. In contrast, our risk model is based on each hospital’s final sketch, which represents all patients who match the query.</p>
        <p>In addition, some amount of information is leaked about the patients <italic>not</italic> included in the sketch, precisely because they were not included. This does not allow an adversary to pinpoint patients with a condition but may sometimes allow them to determine that a patient lacks that condition. Of course, this type of leakage is to some extent a problem with any aggregate query system, because if an adversary learns that only 1% of patients at a hospital have a condition, then they know with high certainty that most patients do not. In line with our analysis mentioned earlier, however, for this type of leakage, <italic>Count</italic> is more private than <italic>HLL</italic>, which is more private than <italic>HashedIDs</italic>, so the same privacy-accuracy trade-off applies.</p>
        <p>We only considered a federated or distributed network in which no patient-level clinical data leave the institution and queries only return aggregate counts. This is in contrast to privacy-preserving record linkage approaches whose goal is to assemble a centralized deduplicated limited or deidentified data set through an honest broker without exchanging identifiable information. With the appropriate technologies, a secure infrastructure, and the proper institutional agreements in place, it is possible to merge data sets, even on large scales. PCORnet, in particular, has used methods similar to <italic>HashedIDs</italic> and <italic>HashedIDs+Rehash</italic> to do this for subsets of hospitals in its network [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. There are multiple advantages of centralized data, including exact results and ease of use. However, in this study, we showed that (1) linking and deduplicating data at the individual patient level is not necessary to obtain accurate estimates and (2) this can be done in a computationally efficient manner. There are benefits to this federated model. It reduces concerns that hospitals might have in sharing data, it does not require updating and relinking the central database, and it places less dependency on having an honest broker.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>We believe that as federated data networks expand to include more institutions and data types (clinical, genomic, environmental, etc), researchers will increasingly depend on fast, accurate, and secure query tools to obtain the greatest possible scientific value from the networks. However, because no single algorithm meets all these requirements, having the ability to select among different methods for a particular application is essential. In this study, we introduce <italic>HLL</italic> and several obfuscation techniques to provide networks with a tunable approach to determine the number of distinct patients who match a query, which is more balanced than commonly used methods that greatly sacrifice accuracy (<italic>Count+Mask</italic>), privacy (<italic>HashedIDs</italic>), or scalability.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Details on the algorithms, secure methods that are not scalable to large networks, the privacy risk score, and the federated hospital network simulation.</p>
        <media xlink:href="jmir_v22i11e18735_app1.pdf" xlink:title="PDF File  (Adobe PDF File), 149 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Detailed benchmark results.</p>
        <media xlink:href="jmir_v22i11e18735_app2.pdf" xlink:title="PDF File  (Adobe PDF File), 156 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Time and space complexity for various methods.</p>
        <media xlink:href="jmir_v22i11e18735_app3.pdf" xlink:title="PDF File  (Adobe PDF File), 118 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">ACT</term>
          <def>
            <p>Accrual to Clinical Trials</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">HLL</term>
          <def>
            <p>HyperLogLog</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">MPC</term>
          <def>
            <p>multiparty computation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">NIH</term>
          <def>
            <p>National Institutes of Health</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">PCORnet</term>
          <def>
            <p>Patient-Centered Outcomes Research Network</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This study was supported by the NIH Big Data to Knowledge Award U54HG007963 from the National Human Genome Research Institute, U01CA198934 from the National Cancer Institute, and R01LM013345 from the National Library of Medicine. YY was also supported by a training grant T15LM007092 from the NIH National Library of Medicine.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jensen</surname>
              <given-names>PB</given-names>
            </name>
            <name name-style="western">
              <surname>Jensen</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Brunak</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Mining electronic health records: towards better research applications and clinical care</article-title>
          <source>Nat Rev Genet</source>
          <year>2012</year>
          <month>05</month>
          <day>2</day>
          <volume>13</volume>
          <issue>6</issue>
          <fpage>395</fpage>
          <lpage>405</lpage>
          <pub-id pub-id-type="doi">10.1038/nrg3208</pub-id>
          <pub-id pub-id-type="medline">22549152</pub-id>
          <pub-id pub-id-type="pii">nrg3208</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fleurence</surname>
              <given-names>RL</given-names>
            </name>
            <name name-style="western">
              <surname>Curtis</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Califf</surname>
              <given-names>RM</given-names>
            </name>
            <name name-style="western">
              <surname>Platt</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Selby</surname>
              <given-names>JV</given-names>
            </name>
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>JS</given-names>
            </name>
          </person-group>
          <article-title>Launching PCORnet, a national patient-centered clinical research network</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2014</year>
          <volume>21</volume>
          <issue>4</issue>
          <fpage>578</fpage>
          <lpage>82</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/24821743"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/amiajnl-2014-002747</pub-id>
          <pub-id pub-id-type="medline">24821743</pub-id>
          <pub-id pub-id-type="pii">amiajnl-2014-002747</pub-id>
          <pub-id pub-id-type="pmcid">PMC4078292</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weber</surname>
              <given-names>GM</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>SN</given-names>
            </name>
            <name name-style="western">
              <surname>McMurry</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Macfadden</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Nigrin</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Churchill</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kohane</surname>
              <given-names>IS</given-names>
            </name>
          </person-group>
          <article-title>The shared health research information network (SHRINE): a prototype federated query tool for clinical data repositories</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2009</year>
          <volume>16</volume>
          <issue>5</issue>
          <fpage>624</fpage>
          <lpage>30</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/19567788"/>
          </comment>
          <pub-id pub-id-type="doi">10.1197/jamia.M3191</pub-id>
          <pub-id pub-id-type="medline">19567788</pub-id>
          <pub-id pub-id-type="pii">M3191</pub-id>
          <pub-id pub-id-type="pmcid">PMC2744712</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McMurry</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>SN</given-names>
            </name>
            <name name-style="western">
              <surname>MacFadden</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Weber</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Simons</surname>
              <given-names>WW</given-names>
            </name>
            <name name-style="western">
              <surname>Orechia</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bickel</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wattanasin</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Gilbert</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Trevvett</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Churchill</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kohane</surname>
              <given-names>IS</given-names>
            </name>
          </person-group>
          <article-title>SHRINE: enabling nationally scalable multi-site disease studies</article-title>
          <source>PLoS One</source>
          <year>2013</year>
          <volume>8</volume>
          <issue>3</issue>
          <fpage>e55811</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0055811"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0055811</pub-id>
          <pub-id pub-id-type="medline">23533569</pub-id>
          <pub-id pub-id-type="pii">PONE-D-12-13223</pub-id>
          <pub-id pub-id-type="pmcid">PMC3591385</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Visweswaran</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Becich</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>D'Itri</surname>
              <given-names>VS</given-names>
            </name>
            <name name-style="western">
              <surname>Sendro</surname>
              <given-names>ER</given-names>
            </name>
            <name name-style="western">
              <surname>MacFadden</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Anderson</surname>
              <given-names>NR</given-names>
            </name>
            <name name-style="western">
              <surname>Allen</surname>
              <given-names>KA</given-names>
            </name>
            <name name-style="western">
              <surname>Ranganathan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>SN</given-names>
            </name>
            <name name-style="western">
              <surname>Morrato</surname>
              <given-names>EH</given-names>
            </name>
            <name name-style="western">
              <surname>Pincus</surname>
              <given-names>HA</given-names>
            </name>
            <name name-style="western">
              <surname>Toto</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Firestein</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Nadler</surname>
              <given-names>LM</given-names>
            </name>
            <name name-style="western">
              <surname>Reis</surname>
              <given-names>SE</given-names>
            </name>
          </person-group>
          <article-title>Accrual to clinical trials (ACT): a clinical and translational science award consortium network</article-title>
          <source>JAMIA Open</source>
          <year>2018</year>
          <month>10</month>
          <volume>1</volume>
          <issue>2</issue>
          <fpage>147</fpage>
          <lpage>52</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/30474072"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamiaopen/ooy033</pub-id>
          <pub-id pub-id-type="medline">30474072</pub-id>
          <pub-id pub-id-type="pii">ooy033</pub-id>
          <pub-id pub-id-type="pmcid">PMC6241502</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weber</surname>
              <given-names>GM</given-names>
            </name>
          </person-group>
          <article-title>Federated queries of clinical data repositories: the sum of the parts does not equal the whole</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2013</year>
          <month>06</month>
          <volume>20</volume>
          <issue>e1</issue>
          <fpage>e155</fpage>
          <lpage>61</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/23349080"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/amiajnl-2012-001299</pub-id>
          <pub-id pub-id-type="medline">23349080</pub-id>
          <pub-id pub-id-type="pii">amiajnl-2012-001299</pub-id>
          <pub-id pub-id-type="pmcid">PMC3715334</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Grannis</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Overhage</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>McDonald</surname>
              <given-names>CJ</given-names>
            </name>
          </person-group>
          <article-title>Analysis of identifier performance using a deterministic linkage algorithm</article-title>
          <source>Proc AMIA Symp</source>
          <year>2002</year>
          <fpage>305</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/12463836"/>
          </comment>
          <pub-id pub-id-type="medline">12463836</pub-id>
          <pub-id pub-id-type="pii">D020002425</pub-id>
          <pub-id pub-id-type="pmcid">PMC2244404</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eastlake</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>US Secure Hash Algorithm 1 (SHA1)</article-title>
          <source>IETF Tools</source>
          <access-date>2020-10-23</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://tools.ietf.org/html/rfc3174">https://tools.ietf.org/html/rfc3174</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Oechslin</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Making a Faster Cryptanalytic Time-memory Trade-Off</article-title>
          <source>Annual International Cryptology Conference</source>
          <year>2003</year>
          <conf-name>CRYPTO'03</conf-name>
          <conf-date>August 17-21, 2003</conf-date>
          <conf-loc>Santa Barbara, CA, USA</conf-loc>
          <pub-id pub-id-type="doi">10.1007/978-3-540-45146-4_36</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Berger</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Secure genome-wide association analysis using multiparty computation</article-title>
          <source>Nat Biotechnol</source>
          <year>2018</year>
          <month>07</month>
          <volume>36</volume>
          <issue>6</issue>
          <fpage>547</fpage>
          <lpage>51</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/29734293"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/nbt.4108</pub-id>
          <pub-id pub-id-type="medline">29734293</pub-id>
          <pub-id pub-id-type="pii">nbt.4108</pub-id>
          <pub-id pub-id-type="pmcid">PMC5990440</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hie</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Berger</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Realizing private and practical pharmacological collaboration</article-title>
          <source>Science</source>
          <year>2018</year>
          <month>10</month>
          <day>19</day>
          <volume>362</volume>
          <issue>6412</issue>
          <fpage>347</fpage>
          <lpage>50</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/30337410"/>
          </comment>
          <pub-id pub-id-type="doi">10.1126/science.aat4807</pub-id>
          <pub-id pub-id-type="medline">30337410</pub-id>
          <pub-id pub-id-type="pii">362/6412/347</pub-id>
          <pub-id pub-id-type="pmcid">PMC6519716</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kolesnikov</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Matania</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Pinkas</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Rosulek</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Trieu</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Practical Multi-Party Private Set Intersection From Symmetric-Key Techniques</article-title>
          <source>Proceedings of the 2017 ACM SIGSAC Conference on Computer and Communications Security</source>
          <year>2017</year>
          <conf-name>CCS'17</conf-name>
          <conf-date>October 30-November 3, 2017</conf-date>
          <conf-loc>Dallas, Texas, USA</conf-loc>
          <pub-id pub-id-type="doi">10.1145/3133956.3134065</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>de Cristofaro</surname>
              <given-names>CE</given-names>
            </name>
            <name name-style="western">
              <surname>Gasti</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Tsudik</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Fast and Private Computation of Cardinality of Set Intersection and Union</article-title>
          <source>International Conference on Cryptology and Network Security</source>
          <year>2012</year>
          <conf-name>CANS'12</conf-name>
          <conf-date>December 12-14, 2012</conf-date>
          <conf-loc>Darmstadt, Germany</conf-loc>
          <pub-id pub-id-type="doi">10.1007/978-3-642-35404-5_17</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Swamidass</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Matlock</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rozenblit</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Securely measuring the overlap between private datasets with cryptosets</article-title>
          <source>PLoS One</source>
          <year>2015</year>
          <volume>10</volume>
          <issue>2</issue>
          <fpage>e0117898</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0117898"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0117898</pub-id>
          <pub-id pub-id-type="medline">25714898</pub-id>
          <pub-id pub-id-type="pii">PONE-D-14-22319</pub-id>
          <pub-id pub-id-type="pmcid">PMC4340911</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fenske</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Mani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sherr</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Distributed Measurement With Private Set-Union Cardinality</article-title>
          <source>Proceedings of the 2017 ACM SIGSAC Conference on Computer and Communications Security</source>
          <year>2017</year>
          <conf-name>CCS'17</conf-name>
          <conf-date>October 30-November 3, 2017</conf-date>
          <conf-loc>Dallas, Texas, USA</conf-loc>
          <pub-id pub-id-type="doi">10.1145/3133956.3134034</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dong</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Loukides</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Approximating private set union/intersection cardinality with logarithmic complexity</article-title>
          <source>IEEE Trans Inform Forensic Secur</source>
          <year>2017</year>
          <month>11</month>
          <volume>12</volume>
          <issue>11</issue>
          <fpage>2792</fpage>
          <lpage>806</lpage>
          <pub-id pub-id-type="doi">10.1109/tifs.2017.2721360</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yigzaw</surname>
              <given-names>KY</given-names>
            </name>
            <name name-style="western">
              <surname>Michalas</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bellika</surname>
              <given-names>JG</given-names>
            </name>
          </person-group>
          <article-title>Secure and scalable deduplication of horizontally partitioned health data for privacy-preserving distributed statistical computation</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2017</year>
          <month>01</month>
          <day>3</day>
          <volume>17</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-016-0389-x"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-016-0389-x</pub-id>
          <pub-id pub-id-type="medline">28049465</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-016-0389-x</pub-id>
          <pub-id pub-id-type="pmcid">PMC5209873</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weber</surname>
              <given-names>GM</given-names>
            </name>
          </person-group>
          <article-title>Federated queries of clinical data repositories: scaling to a national network</article-title>
          <source>J Biomed Inform</source>
          <year>2015</year>
          <month>06</month>
          <volume>55</volume>
          <fpage>231</fpage>
          <lpage>6</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(15)00079-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2015.04.012</pub-id>
          <pub-id pub-id-type="medline">25957825</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(15)00079-9</pub-id>
          <pub-id pub-id-type="pmcid">PMC4464929</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Flajolet</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Fusy</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Gandouet</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Meunier</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Hyperloglog: the Analysis of a Near-Optimal Cardinality Estimation Algorithm</article-title>
          <source>Conference on Analysis of Algorithms</source>
          <year>2007</year>
          <conf-name>AofA'07</conf-name>
          <conf-date>June 17-22, 2007</conf-date>
          <conf-loc>Juan-les-Pins, France</conf-loc>
          <fpage>1</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bar-Yossef</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Jayram</surname>
              <given-names>TS</given-names>
            </name>
            <name name-style="western">
              <surname>Kumar</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sivakumar</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Trevisan</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Counting Distinct Elements in a Data Stream</article-title>
          <source>International Workshop on Randomization and Approximation Techniques in Computer Science</source>
          <year>2002</year>
          <conf-name>RANDOM'02</conf-name>
          <conf-date>September 13-15, 2002</conf-date>
          <conf-loc>Cambridge, MA, USA</conf-loc>
          <pub-id pub-id-type="doi">10.1007/3-540-45726-7_1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Elgamal</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>A public key cryptosystem and a signature scheme based on discrete logarithms</article-title>
          <source>IEEE Trans Inform Theory</source>
          <year>1985</year>
          <month>07</month>
          <volume>31</volume>
          <issue>4</issue>
          <fpage>469</fpage>
          <lpage>72</lpage>
          <pub-id pub-id-type="doi">10.1109/tit.1985.1057074</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Dankar</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Protecting privacy using k-anonymity</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2008</year>
          <volume>15</volume>
          <issue>5</issue>
          <fpage>627</fpage>
          <lpage>37</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/18579830"/>
          </comment>
          <pub-id pub-id-type="doi">10.1197/jamia.M2716</pub-id>
          <pub-id pub-id-type="medline">18579830</pub-id>
          <pub-id pub-id-type="pii">M2716</pub-id>
          <pub-id pub-id-type="pmcid">PMC2528029</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jakobsson</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Juels</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Mix and Match: Secure Function Evaluation via Ciphertexts</article-title>
          <source>International Conference on the Theory and Application of Cryptology and Information Security</source>
          <year>2000</year>
          <conf-name>ASIACRYPT'00</conf-name>
          <conf-date>December 3-7, 2000</conf-date>
          <conf-loc>Kyoto, Japan</conf-loc>
          <pub-id pub-id-type="doi">10.1007/3-540-44448-3_13</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Berry</surname>
              <given-names>BJ</given-names>
            </name>
          </person-group>
          <article-title>City size distributions and economic development</article-title>
          <source>Econ Dev Cult Change</source>
          <year>1961</year>
          <month>07</month>
          <volume>9</volume>
          <issue>4, Part 1</issue>
          <fpage>573</fpage>
          <lpage>88</lpage>
          <pub-id pub-id-type="doi">10.1086/449923</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sweeney</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>K-anonymity: a model for protecting privacy</article-title>
          <source>Int J Unc Fuzz Knowl Based Syst</source>
          <year>2002</year>
          <month>10</month>
          <volume>10</volume>
          <issue>5</issue>
          <fpage>557</fpage>
          <lpage>70</lpage>
          <pub-id pub-id-type="doi">10.1142/s0218488502001648</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="web">
          <article-title>yunwilliamyu / secure-distributed-union-cardinality</article-title>
          <source>GitHub</source>
          <access-date>2020-10-23</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/yunwilliamyu/secure-distributed-union-cardinality">https://github.com/yunwilliamyu/secure-distributed-union-cardinality</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Desfontaines</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Lochbihler</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Basin</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Cardinality estimators do not preserve privacy</article-title>
          <source>ArXiv</source>
          <year>2020</year>
          <fpage>-</fpage>
          <comment>epub ahead of print</comment>
          <pub-id pub-id-type="doi">10.2478/popets-2019-0018</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kho</surname>
              <given-names>AN</given-names>
            </name>
            <name name-style="western">
              <surname>Cashy</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Jackson</surname>
              <given-names>KL</given-names>
            </name>
            <name name-style="western">
              <surname>Pah</surname>
              <given-names>AR</given-names>
            </name>
            <name name-style="western">
              <surname>Goel</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Boehnke</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Humphries</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Kominers</surname>
              <given-names>SD</given-names>
            </name>
            <name name-style="western">
              <surname>Hota</surname>
              <given-names>BN</given-names>
            </name>
            <name name-style="western">
              <surname>Sims</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Malin</surname>
              <given-names>BA</given-names>
            </name>
            <name name-style="western">
              <surname>French</surname>
              <given-names>DD</given-names>
            </name>
            <name name-style="western">
              <surname>Walunas</surname>
              <given-names>TL</given-names>
            </name>
            <name name-style="western">
              <surname>Meltzer</surname>
              <given-names>DO</given-names>
            </name>
            <name name-style="western">
              <surname>Kaleba</surname>
              <given-names>EO</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>RC</given-names>
            </name>
            <name name-style="western">
              <surname>Galanter</surname>
              <given-names>WL</given-names>
            </name>
          </person-group>
          <article-title>Design and implementation of a privacy preserving electronic health record linkage tool in Chicago</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2015</year>
          <month>09</month>
          <volume>22</volume>
          <issue>5</issue>
          <fpage>1072</fpage>
          <lpage>80</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/26104741"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocv038</pub-id>
          <pub-id pub-id-type="medline">26104741</pub-id>
          <pub-id pub-id-type="pii">ocv038</pub-id>
          <pub-id pub-id-type="pmcid">PMC5009931</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bian</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Loiacono</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sura</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mendoza</surname>
              <given-names>VT</given-names>
            </name>
            <name name-style="western">
              <surname>Lipori</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Shenkman</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Hogan</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Implementing a hash-based privacy-preserving record linkage tool in the oneFlorida clinical research network</article-title>
          <source>JAMIA Open</source>
          <year>2019</year>
          <month>09</month>
          <volume>2</volume>
          <issue>4</issue>
          <fpage>562</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1093/jamiaopen/ooz050</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
