<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v22i7e18055</article-id>
      <article-id pub-id-type="pmid">32673230</article-id>
      <article-id pub-id-type="doi">10.2196/18055</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Exploring the Privacy-Preserving Properties of Word Embeddings: Algorithmic Validation Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Choemprayong</surname>
            <given-names>Songphan</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Shin</surname>
            <given-names>Soo-Yong</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Abdalla</surname>
            <given-names>Mohamed</given-names>
          </name>
          <degrees>BSc, MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Computer Science</institution>
            <institution>University of Toronto</institution>
            <addr-line>Bahen Centre for Information Technology</addr-line>
            <addr-line>40 St. George Street, Room 4283</addr-line>
            <addr-line>Toronto, ON, M5S 2E4</addr-line>
            <country>Canada</country>
            <phone>1 4169787816</phone>
            <email>mohamed.abdalla@mail.utoronto.ca</email>
          </address>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2776-6036</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Abdalla</surname>
            <given-names>Moustafa</given-names>
          </name>
          <degrees>BSc, DPhil</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <xref rid="aff5" ref-type="aff">5</xref>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2481-9753</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Hirst</surname>
            <given-names>Graeme</given-names>
          </name>
          <degrees>BSc, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9482-1042</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Rudzicz</surname>
            <given-names>Frank</given-names>
          </name>
          <degrees>BSc, MSc, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff7" ref-type="aff">7</xref>
          <xref rid="aff8" ref-type="aff">8</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1139-3423</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Computer Science</institution>
        <institution>University of Toronto</institution>
        <addr-line>Toronto, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>The Vector Institute for Artificial Intelligence</institution>
        <addr-line>Toronto, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Institute for Clinical Evaluative Sciences</institution>
        <addr-line>Toronto, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Statistics</institution>
        <institution>Computational Statistics &#38; Machine Learning Group</institution>
        <institution>University of Oxford</institution>
        <addr-line>Oxford</addr-line>
        <country>United Kingdom</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Wellcome Centre for Human Genetics</institution>
        <institution>Nuffield Dept of Medicine</institution>
        <institution>University of Oxford</institution>
        <addr-line>Oxford</addr-line>
        <country>United Kingdom</country>
      </aff>
      <aff id="aff6">
        <label>6</label>
        <institution>Harvard Medical School</institution>
        <addr-line>Boston, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff7">
        <label>7</label>
        <institution>International Centre for Surgical Safety</institution>
        <institution>Li Ka Shing Knowledge Institute</institution>
        <institution>St Michael’s Hospital</institution>
        <addr-line>Toronto, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff8">
        <label>8</label>
        <institution>Surgical Safety Technologies Inc</institution>
        <addr-line>Toronto, ON</addr-line>
        <country>Canada</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Mohamed Abdalla <email>mohamed.abdalla@mail.utoronto.ca</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>7</month>
        <year>2020</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>15</day>
        <month>7</month>
        <year>2020</year>
      </pub-date>
      <volume>22</volume>
      <issue>7</issue>
      <elocation-id>e18055</elocation-id>
      <history>
        <date date-type="received">
          <day>17</day>
          <month>2</month>
          <year>2020</year>
        </date>
        <date date-type="rev-request">
          <day>9</day>
          <month>4</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>21</day>
          <month>4</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>27</day>
          <month>4</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©Mohamed Abdalla, Moustafa Abdalla, Graeme Hirst, Frank Rudzicz. Originally published in the Journal of Medical Internet Research (http://www.jmir.org), 15.07.2020.</copyright-statement>
      <copyright-year>2020</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on http://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2020/7/e18055" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Word embeddings are dense numeric vectors used to represent language in neural networks. Until recently, there had been no publicly released embeddings trained on clinical data. Our work is the first to study the privacy implications of releasing these models.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This paper aims to demonstrate that traditional word embeddings created on clinical corpora that have been deidentified by removing personal health information (PHI) can nonetheless be exploited to reveal sensitive patient information.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We used embeddings created from 400,000 doctor-written consultation notes and experimented with 3 common word embedding methods to explore the privacy-preserving properties of each.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>We found that if publicly released embeddings are trained from a corpus anonymized by PHI removal, it is possible to reconstruct up to 68.5% (n=411/600) of the full names that remain in the deidentified corpus and associated sensitive information to specific patients in the corpus from which the embeddings were created. We also found that the distance between the word vector representation of a patient’s name and a diagnostic billing code is informative and differs significantly from the distance between the name and a code not billed for that patient.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Special care must be taken when sharing word embeddings created from clinical texts, as current approaches may compromise patient privacy. If PHI removal is used for anonymization before traditional word embeddings are trained, it is possible to attribute sensitive information to patients who have not been fully deidentified by the (necessarily imperfect) removal algorithms. A promising alternative (ie, anonymization by PHI replacement) may avoid these flaws. Our results are timely and critical, as an increasing number of researchers are pushing for publicly available health data.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>privacy</kwd>
        <kwd>data anonymization</kwd>
        <kwd>natural language processing</kwd>
        <kwd>personal health records</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Motivation</title>
        <p>Natural language processing (NLP) is increasingly used to assist medical practitioners with various tasks, ranging from patient phenotyping to unplanned hospital readmission prediction [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. Although a diverse range of approaches are used, a large number of NLP applications use algorithms, such as Continuous Bag of Words (CBOW), Skipgram, and Global Vectors (GloVe) [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>], which represent tokens as dense numeric vectors termed as <italic>word embeddings.</italic> Most of these representations are computed from large corpora of text, such as clinical notes or narratives from health records, made available by health care providers (HCPs). Usually, before these data are provided to researchers, the HCPs apply anonymization algorithms to deidentify the personal health information (PHI) in the data. In this work, we adopted the terminology of the US Health Insurance Portability and Accountability Act (HIPAA), where <italic>PHI</italic> refers to <italic>individually identifiable health information</italic>, which includes personal identifiers ranging from names and phone numbers to fingerprints.</p>
        <p>There is a wide variety of techniques to locate and deidentify PHI in clinical text, ranging from dictionaries [<xref ref-type="bibr" rid="ref6">6</xref>] to recurrent neural networks [<xref ref-type="bibr" rid="ref7">7</xref>]. Once the sensitive information is located within a record, anonymization can employ either <italic>removal</italic> or <italic>replacement</italic>, that is, the sensitive information is either simply deleted, changed to a data-type identification tag such as <italic>*NAME*</italic>, or replaced with another randomly chosen PHI of the same type. Many publicly available resources use PHI removal, for example, the Medical Information Mart for Intensive Care (MIMIC-III) dataset [<xref ref-type="bibr" rid="ref8">8</xref>] used informative deidentification tags. However, in this paper, we showed that as no perfect PHI search algorithm exists, data <italic>secured</italic> this way can be exploited because traces of identities remain in the text and are detectable even in embeddings that are generated from it.</p>
        <p>Specifically, we discussed the privacy concerns that arise from publicly releasing word embeddings that have been trained on clinical notes secured using the PHI removal paradigm. At first glance, it may seem that releasing word embeddings has low risk because of the unordered nature of these models; all that is released is a list of words, arbitrarily ordered, with dense numeric vectors associated with each word. However, through our experiments with three of the most popular embedding techniques, we showed that they can be leveraged to learn information presumed to be removed.</p>
        <p>Our work relies on the assumption that some name tokens will inevitably be missed by the deidentification process. This is a realistic assumption as, to date, there is no deidentification algorithm that has perfect recall (ie, captures all PHI). This necessarily means that the word list of the embedding model will contain names that are not properly protected. We also assume that malicious actors will be able to successfully identify these tokens from a very large wordlist. Given these two assumptions, a publicly released traditional word embedding model then presents a small, but nontrivial, risk of patient identities being attacked. This risk is relative to the number of patients in the data set and the particular deidentification and embedding algorithms used. Up to 0.6% of all patients may be at risk of having their full names detected in a data set (built from individual name tokens), and as many as 0.02% to 0.15% may have their full name associated with a diagnosis. Although these risks appear small, with the growing number of publicly available embeddings trained on clinical data, we aimed to draw attention to the possible critical mass of potential privacy exposure.</p>
        <p>Specifically, we showed that (1) it is possible to associate name tokens together to form <italic>true</italic> name pairs, (2) there is a significant difference between the distances of diagnoses that have been associated with a patient and those of diagnoses not associated, and this is true both at the population level and at the patient level, and (3) it is possible for a malicious actor to determine diagnoses assigned to multiple patients, using only precomputed embeddings. In this work, we will refer to diagnostic codes and diagnoses interchangeably, although this is not, of course, a general equivalence. Here, we take the diagnostic code simply as an indication of the condition that the patient is suspected of having, which is sensitive information that must be protected. Finally, we replicate these results and perform further experiments with a synthetic data set that we make publicly available.</p>
        <p>Our work is the first to study the privacy implications of releasing word embeddings. This demonstrates how anonymizing clinical notes using PHI removal is likely to leave sensitive patient information vulnerable. By methodically exploring a variety of algorithms and hyperparameters, we showed that our observation holds in the general case. Furthermore, we demonstrated that it is easier to reassociate sensitive information with rare names compared with common ones. Finally, we argue that, given our results, data holders and providers should explore whether other paradigms, such as PHI replacement, are more successful in securing sensitive information when compared with PHI removal.</p>
      </sec>
      <sec>
        <title>Background</title>
        <sec>
          <title>Clinical Word Embeddings</title>
          <p>Word embeddings (ie, <italic>word vectors</italic> or <italic>distributed representations</italic>) are dense numeric vectors used to represent words. Many word embedding techniques fall into one of two categories: low-rank approximations of co-occurrence matrices [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref9">9</xref>] and those created using shallow neural networks using contextual information [<xref ref-type="bibr" rid="ref10">10</xref>]. There is also a recent and growing body of embedding models employing deeper neural networks to create contextual word embeddings, which vary depending on the surrounding context [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>].</p>
          <p>Inspired by the distributional hypothesis [<xref ref-type="bibr" rid="ref13">13</xref>], word embeddings trained on health care data are strongly correlated with human-annotated word similarity metrics for medical terms [<xref ref-type="bibr" rid="ref2">2</xref>], although their performance on clinical classification tasks is strongly dependent on the quality, size, and type of data from which they are created [<xref ref-type="bibr" rid="ref14">14</xref>]. In fact, embeddings created from clinically related data (eg, clinical notes and biomedical text, such as a collection of all PubMed Central articles and PubMed abstracts), often performed better than, and never performed worse than, unspecialized corpora [<xref ref-type="bibr" rid="ref2">2</xref>].</p>
          <p>Until recently, there had been no publicly released embeddings trained on clinical data [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref18">18</xref>]. However, some newly released embeddings [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref18">18</xref>] are trained using contextual word embedding models on MIMIC, which itself uses PHI removal to abide by HIPAA regulations. Our work demonstrates how, if no additional security measures are taken, then traditional (noncontextual) models may be compromised. More work is required to assess whether our findings hold for the four new models as well (ie, contextual word embeddings) [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref18">18</xref>].</p>
        </sec>
        <sec>
          <title>Privacy of Clinical Notes</title>
          <p>There are 3 main approaches to protect the privacy of patients: dictionary-based, statistical, and hybrid approaches. Dictionary-based methods often use large wordlists or predefined regular expressions to locate private information in the text [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. Statistical methods, often more robust than dictionary-based approaches [<xref ref-type="bibr" rid="ref20">20</xref>], use models such as recurrent neural networks [<xref ref-type="bibr" rid="ref7">7</xref>] to automatically detect private information. Hybrid methods combine the two approaches to compensate for their respective weaknesses [<xref ref-type="bibr" rid="ref21">21</xref>]. No matter the method used to detect PHI, once it is detected, there are two ways to secure the data: PHI removal and PHI replacement.</p>
        </sec>
        <sec>
          <title>Personal Health Information Removal</title>
          <p>In PHI removal, sensitive information is located in text via a specialized search algorithm and then is either deleted or replaced with an informative deidentification tag (eg, all names are replaced with <italic>[*NAME*]</italic>). Although simple and common, this approach is not secure and can be easily exploited. Given that no PHI search algorithm is perfect, as a data set increases in size, it becomes increasingly certain that some PHI will be missed. Thus, if clinical notes are shared in a text format after this technique is used for deidentification, a malicious party can uncover names missed by the algorithm by manually inspecting the data. We demonstrate later that word embeddings created from such data are also vulnerable to similar exploits.</p>
        </sec>
        <sec>
          <title>Personal Health Information Replacement</title>
          <p>In PHI replacement, sensitive information, once located within the text by the search algorithm, is replaced with other information of the same type; for example, names can be randomly replaced with other names. This approach is more secure than PHI removal as it obscures instances where the PHI detection algorithm has failed and thus provides the data-curator with plausible deniability for any specific record.</p>
          <p>We advocate that HCPs and data providers employ this paradigm because, if done correctly, it is much harder to exploit and thus reduces the risk to patient privacy. It is also a simple and effective way to protect against the exploitation of word embeddings that we demonstrated in this work.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data</title>
        <p>In our experiments, we used consultation notes. In <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, we demonstrate how these findings are reproducible with an experiment performed with a selected subset of Wikipedia pages. We made the latter publicly available alongside the code. For all texts, we removed all punctuation and numeric characters, and we lowercased all text but performed no lemmatization, tokenization, or any other preprocessing.</p>
        <p>We used consultation notes provided to the authors by ICES (formerly known as the Institute for Clinical Evaluative Sciences) under data sharing agreements with physicians for the purposes of evaluation and research. Consultation notes are written by specialist physicians and other health care consultants to a patient’s family physician. They describe the tests performed, results observed, and other details that the specialist physician or health care consultant considers relevant. We compiled patients' consultation notes and all their prescribed diagnostic codes that are indicative of suspected diagnoses and ordered tests, and are therefore sensitive health information that must not be connected to patient identities. The billing codes table includes text fields describing each code in 1 to 3 words, for example, <italic>colon screening</italic>. These data sets are linked using unique encoded identifiers and analyzed at ICES.</p>
        <p>Although this work is conducted at ICES, ICES does not grant its research affiliates (including the authors of this paper) access to <italic>true</italic> patient names, but replaces them in the manner described earlier (PHI replacement), using a semimanual, dictionary-based masking process to consistently replace each true name with a randomly chosen fake name. We used heuristics to detect names in the notes. More concretely, we looked for semistructured notes that have <italic>Name: str1, …, strN</italic> (representing a series of alphabetical tokens separated by commas followed by a semicolon) to indicate the presence of a name. The heuristic is not 100% accurate, which is why, in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>, we can provide only an estimate of how many true names exist by manually analyzing a randomly sampled set.</p>
        <p>We perform our experiments on clinical consultation notes for which we can locate the associated fake patient name. For our experiments, we treat the fake names as if they were the <italic>true</italic> names and removed 99% of them, thus emulating current PHI removal algorithms [<xref ref-type="bibr" rid="ref7">7</xref>]. This protected data set is then used as the first step of our experiments, as shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>. Detailed information regarding the data is provided in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Process flow for gathering and preparing the clinical notes for embedding generation and experimentation.</p>
          </caption>
          <graphic xlink:href="jmir_v22i7e18055_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Experiments</title>
        <sec>
          <title>Experimental Hypothesis</title>
          <p>The intuition behind reidentifying patient information solely from word embeddings stems from the <italic>distributional hypothesis</italic> [<xref ref-type="bibr" rid="ref13">13</xref>]—that words appearing in similar contexts tend to have similar meanings and therefore have closer vector representations than other words. Knowing this, we expect differences between both:</p>
          <list list-type="order">
            <list-item>
              <p>The average distance between the tokens that make up a person’s name, compared with tokens from different names.</p>
            </list-item>
            <list-item>
              <p>The average distance in vector space between a person's name and their diagnoses (referred to as the in-group), compared with the average distance between their name and those diagnoses with which they are not associated (referred to as the out-group).</p>
            </list-item>
          </list>
          <p>If there is a large enough distance between a person's in-group and out-group, then this observation could be used to extract sensitive information thought to have been hidden by the unordered nature of embeddings. In the following sections, we validated this hypothesis empirically.</p>
        </sec>
        <sec>
          <title>Experiment 1: Name Reconstruction Experiment</title>
          <p>In the first experiment, we tested whether it is possible to reconstruct true name pairs simply from a list of individual name tokens. <xref rid="figure2" ref-type="fig">Figure 2</xref> presents the steps of this experiment, picking up from the last step of <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>Process flow for generating word embeddings and performing the name reconstruction experiment.</p>
            </caption>
            <graphic xlink:href="jmir_v22i7e18055_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <p>A list of individual name tokens, corresponding to the fifth step in <xref rid="figure2" ref-type="fig">Figure 2</xref>, is easily generated by manual exploration of the words. However, as we left 1% of the names in place to emulate the imperfect deidentification algorithms, we knew all the tokens (ie, the 1% of name tokens purposefully left in place).</p>
          <p>We performed this experiment on our consultation notes data set, where over 99% of names were removed to emulate a PHI removal approach and only 1054 unique name tokens (from 650 full names) remained in the text.</p>
          <p>We performed our experiment with 3 commonly used traditional word embedding algorithms (CBOW, Skipgram, and GloVe) for clinical prediction and modeling tasks. For each, we tested a variety of hyperparameters. Where a specific hyperparameter is not explicitly mentioned, we used the default hyperparameter of the training model, which can be found in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p>
          <p>However, for the sixth step, an attacker would not know how many full names were in the data set. If we assume that each name is composed of 2 tokens and none of the names share any name tokens, we would expect the number of complete names to be half the number of name tokens (ie, 1054/2 complete names). Relaxing both assumptions increases the expected names. Given name tokens <italic>A</italic> and <italic>B</italic>, we considered a name to exist if either 〈<italic>A,B</italic>〉 or 〈<italic>B,A</italic>〉 exist as names (ie, ignoring ordering). On this data set, we created many word embedding models (n=88) with a wide set of hyperparameters (ie, model specifications) that included variations in the distance metric (cosine or cityblock) and context window size.</p>
        </sec>
        <sec>
          <title>Experiment 2: Name-Diagnostic Code Association Experiment</title>
          <p>In this section, we explored the second part of our hypothesis: is there a difference between the average distance in vector space between a person's name and their diagnoses (their <italic>in-group</italic>) compared with the average distance between their name and those diagnoses with which they are <italic>not</italic> associated (their <italic>out-group</italic>)?</p>
          <p>For this experiment, we used the same data and tested the properties of the same word embedding algorithms for various hyperparameters, as in the last experiment. We first define a patient's name vector as the average of the vectors of its components (ie<italic>,</italic> first, last, and possibly middle names). Here, <italic>numtoken</italic> is the number of space-separated tokens in a string and <italic>v<sub>i</sub></italic> is the vector representation of the <italic>i</italic>th token of the name:</p>
          <disp-formula>
            <graphic xlink:href="jmir_v22i7e18055_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>Second, we defined the in-group <italic>d<sub>in</sub></italic> as the set of diagnoses for <italic>name</italic> and the out-group <italic>d<sub>out</sub></italic>, as all other diagnoses, with <italic>d<sub>i</sub></italic> representing any individual diagnosis. The average distance for each of these groups from their respective names are referred to as <italic>in_group</italic> and <italic>out_group,</italic> respectively:</p>
          <disp-formula>
            <graphic xlink:href="jmir_v22i7e18055_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <disp-formula>
            <graphic xlink:href="jmir_v22i7e18055_fig10.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>We presented the results using the <italic>cityblock</italic> distance (ie, the Manhattan distance) instead of the cosine distance because it performs better at this task (by uncovering more information), and past work has shown that the vector magnitude (ie<italic>,</italic> the sum of all dimensions) is affected by the number of times that the word occurs in the corpus [<xref ref-type="bibr" rid="ref22">22</xref>]. However, our experiments were performed using the cosine distance metric as well, and complete results can be found in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p>
          <p>Initially, we explored the raw data (ie, without any deidentification algorithm) by plotting the difference between the in- and out-groups for names that occur below different frequency thresholds. A name is below the threshold if the average counts of its components are below that threshold. For example, if “James” occurs 201 times in the corpus and “Qwerty” appears twice, then “James Qwerty” is below an arbitrary threshold of 200 (101.5&#60;200).</p>
          <p><xref rid="figure3" ref-type="fig">Figure 3</xref> shows that the more frequently a name occurs, the smaller the difference between the in-groups and out-groups. Nonetheless, the difference is still pronounced when all names are considered, with the lowest value being just under 5. Surprisingly, against our intuition, the in-group is larger than the out-group. We saw this result consistently throughout our testing described in the following sections.</p>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>Relationship between frequency of name occurrence and the average difference between the in-group and out-group for patients. This graph is generated from an experiment run on a GloVe model with a dimension of 100, window of 10, learning rate of 0.05, minimum occurrence of 1, and alpha of .75.</p>
            </caption>
            <graphic xlink:href="jmir_v22i7e18055_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Statistical Testing</title>
          <p>Given our initial observation that, on raw data, there is a difference between in- and out-groups on the population level, we now examine if the observed differences are statistically significant at both the population and patient levels for various embedding algorithms and hyperparameters on the deidentified data set (ie, 99% of names have been removed to emulate an optimum real-life data sharing scenario). A diagram of the experimental process is shown in <xref rid="figure4" ref-type="fig">Figure 4</xref>.</p>
          <fig id="figure4" position="float">
            <label>Figure 4</label>
            <caption>
              <p>Process flow for generating word embeddings and performing statistical testing. For population-level statistical testing, we performed a Wilcoxon signed-rank test, and for patient-level statistical testing, we calculated empirical <italic>P</italic> values using 1000 randomly generated permutations.</p>
            </caption>
            <graphic xlink:href="jmir_v22i7e18055_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Experiment 2a: Population-Level Statistical Testing</title>
          <p>In this experiment, we aimed to determine whether the difference between the in- and out-groups on the population level is statistically significant.</p>
          <p>Here, as with all the clinical text experiments, the embedding model is trained using all consultation notes after 99% of the names have been removed. Using the same setup as in the previous section to obtain distances between in- and out-groups, we used the Wilcoxon signed-rank test to compare the pairings of in- and out-groups for each name on the population level. The Wilcoxon signed-rank test is nonparametric and, unlike the paired Student two-tailed <italic>t</italic> test, makes no assumptions regarding normality.</p>
          <p>This experiment is performed for various embedding algorithms, distance metrics, and hyperparameter ranges.</p>
        </sec>
        <sec>
          <title>Experiment 2b: Patient-Level Statistical Testing</title>
          <p>Here, we explored whether there is a statistically significant difference between the in- and out-groups for each patient, which would indicate that an individual patient is at risk of having their diagnostic code uncovered.</p>
          <p>In this experiment, we compared the average difference between a patient’s in-group and the out-group. Although each comparison will result in a <italic>P</italic> value for each patient, for brevity and privacy, we do not report the per patient analysis of the ICES data, but instead report the number of patients for which the difference is significant after correcting for multiple comparisons. To determine statistical significance at the patient level, we calculated empirical <italic>P</italic> values by randomly sampling in- and out-groups generated using 1000 permutations of the same size from the same data set.</p>
          <p>We experimented with various embedding algorithms, distance metrics, and hyperparameter ranges.</p>
        </sec>
        <sec>
          <title>Experiment 3: Scenario Simulation</title>
          <p>In this experiment, we performed a hypothetical attack to examine whether the results of the previous 2 experiments demonstrate an actionable level of risk. Assuming the role of an attacker who has access only to released embeddings built from doctor-patient consultation notes that have been secured by using PHI removal, we showed how we are able to associate name tokens that were missed by PHI removal to arrive at a list of complete patient names and that we are able to associate these names with some target diagnoses.</p>
          <p>For this hypothetical scenario, we used the same data and tested the properties of the same word embedding algorithms for various hyperparameters as in the last experiment.</p>
          <p>The attack is as follows:</p>
          <list list-type="order">
            <list-item>
              <p>Identify a list of target diagnoses that we wish to attribute to patients. As an example, we considered the following set of diagnoses: constipation, diarrhea, vaginitis, sexual dysfunction, urinary infection, herpes genitalis, dementia, anorexia, alcoholism, threatened abortion, and AIDS.</p>
            </list-item>
            <list-item>
              <p>For each name, calculate the 5 diagnoses that are farthest from the name.</p>
            </list-item>
            <list-item>
              <p>Using these 5 diagnoses as the basis for prediction, we calculated Top-1 (A@1) and Top-5 (A@5) accuracy.</p>
            </list-item>
          </list>
          <p>To ensure that our results are not an artifact of the selected diagnoses, we repeated the above experiment 1000 times for each tested hyperparameter, randomly selecting 30 target diagnoses. To be as stringent as possible, we chose from diagnoses that appeared at least 10 times in the data (which likely will result in a pessimistic bias, as demonstrated in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>).</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Experiment 1: Name Reconstruction Experiment</title>
        <p>The results of this experiment demonstrate that it is possible to reconstruct true name pairs simply from a list of individual name tokens and their respective embeddings.</p>
        <p>In this section, we present the results for various context window sizes, an expected name list of size 600, and a cosine distance metric. We observed that up to 68.5% (411/600) of the paired tokens come from true names, as shown in <xref ref-type="table" rid="table1">Table 1</xref> and <xref rid="figure5" ref-type="fig">Figure 5</xref>. As there are over 170,000 name-pair combinations, these embeddings clearly carry patient information that can be identified, thus affirming our hypothesis. The complete results for other hyperparameters, the number of names expected, and the cityblock distance metric are presented in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>The number and percentage of paired tokens that are part of true names as a function of context window size, using the cosine distance metric of the first 600 paired tokens sorted in ascending order.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="250"/>
            <col width="280"/>
            <col width="230"/>
            <col width="240"/>
            <thead>
              <tr valign="bottom">
                <td>Context window size</td>
                <td>Skipgram names, n (%)</td>
                <td>CBOW<sup>a</sup> names, n (%)</td>
                <td>GLoVe<sup>b</sup> names, n (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1</td>
                <td>51 (8.5)</td>
                <td>17 (2.8)</td>
                <td>8 (1.3)<sup>c</sup></td>
              </tr>
              <tr valign="top">
                <td>3</td>
                <td>369 (61.5)</td>
                <td>265 (44.2)</td>
                <td>158 (26.3)</td>
              </tr>
              <tr valign="top">
                <td>5</td>
                <td>393 (65.6)</td>
                <td>323 (53.8)</td>
                <td>278 (46.3)</td>
              </tr>
              <tr valign="top">
                <td>7</td>
                <td>410 (68.3)</td>
                <td>331 (55.2)</td>
                <td>317 (52.8)</td>
              </tr>
              <tr valign="top">
                <td>9</td>
                <td>411 (68.5)</td>
                <td>340 (56.7)</td>
                <td>323 (53.8)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>CBOW: Continuous Bag of Words.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>GLoVe: Global Vectors.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>Result not significant after correcting for multiple comparisons using the Holm-Bonferroni correction.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Visual representation of the percentage of paired names belonging to true names from the first 600 paired tokens when sorted in ascending order.</p>
          </caption>
          <graphic xlink:href="jmir_v22i7e18055_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Experiment 2: Name-Diagnostic Code Association Experiment</title>
        <sec>
          <title>Experiment 2a: Population-Level Statistical Testing</title>
          <p>The results of this experiment indicate that, at the population level, the average difference between the in- and out-groups per patient is statistically significant. <xref ref-type="table" rid="table2">Table 2</xref> and <xref rid="figure6" ref-type="fig">Figure 6</xref> show the results for various embedding algorithms, varying context window sizes, and a cityblock distance metric. The complete results for other hyperparameters, other distance measures, and absolute distances are shown in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p>
          <table-wrap position="float" id="table2">
            <label>Table 2</label>
            <caption>
              <p>Difference between the in-group and out-group as a function of context window size for various word embedding algorithms using the cityblock distance metric. The differences are relative distances between word embedding vectors in an n-dimensional space.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="250"/>
              <col width="280"/>
              <col width="230"/>
              <col width="240"/>
              <thead>
                <tr valign="bottom">
                  <td>Context window size<sup>a</sup></td>
                  <td>Skipgram difference</td>
                  <td>CBOW<sup>b</sup> difference</td>
                  <td>GLoVe<sup>c</sup> difference</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>1</td>
                  <td>3.91</td>
                  <td>7.59</td>
                  <td>4.85</td>
                </tr>
                <tr valign="top">
                  <td>3</td>
                  <td>2.88</td>
                  <td>28.53</td>
                  <td>5.69</td>
                </tr>
                <tr valign="top">
                  <td>5</td>
                  <td>2.33</td>
                  <td>39.55</td>
                  <td>5.45</td>
                </tr>
                <tr valign="top">
                  <td>7</td>
                  <td>1.84</td>
                  <td>47.10</td>
                  <td>5.12</td>
                </tr>
                <tr valign="top">
                  <td>9</td>
                  <td>1.51</td>
                  <td>51.61</td>
                  <td>5.54</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table2fn1">
                <p><sup>a</sup>All differences were statistically significant after correcting for multiple comparisons.</p>
              </fn>
              <fn id="table2fn2">
                <p><sup>b</sup>CBOW: Continuous Bag of Words.</p>
              </fn>
              <fn id="table2fn3">
                <p><sup>c</sup>GLoVe: Global Vectors.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <fig id="figure6" position="float">
            <label>Figure 6</label>
            <caption>
              <p>Visualization of the difference between the in-group and the out-group as a function of context window size for various word embedding algorithms using the cityblock distance metric.</p>
            </caption>
            <graphic xlink:href="jmir_v22i7e18055_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <p>Given our selected hyperparameters, we observed that for all sizes tested and for all embedding techniques, the difference between the in- and out-groups on the population level was statistically significant with <italic>P</italic>&#60;.001 calculated using the Wilcoxon test, after correcting for multiple comparisons using the Holm-Bonferroni correction [<xref ref-type="bibr" rid="ref23">23</xref>]. The Holm-Bonferroni correction is a sequentially rejective procedure for correcting multiple comparisons that keeps the family-wise type I error bounded. <xref rid="figure6" ref-type="fig">Figure 6</xref> shows that the difference between the in-group and out-group decreases for embeddings created with the Skipgram algorithm as the context window increases. Conversely, the difference grows for CBOW, while it remains relatively stable for all GloVe models.</p>
        </sec>
        <sec>
          <title>Experiment 2b: Patient-Level Statistical Testing</title>
          <p>Building on our previous observations, the results of this experiment indicate that, at the patient level, for a percentage of examined patients (up to 449/638, 70.4%), the average difference between in- and out-groups per patient is statistically significant.</p>
          <p><xref ref-type="table" rid="table3">Table 3</xref> and <xref rid="figure7" ref-type="fig">Figure 7</xref> show the results for various embedding algorithms, varying context window sizes, and a cityblock distance metric. The complete results for other hyperparameters, other distance measures, and absolute distances are shown in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p>
          <table-wrap position="float" id="table3">
            <label>Table 3</label>
            <caption>
              <p>The percentage of patients whose diagnoses are identifiable due to a statistically significant difference between the in-group and out-group as a function of context window size for various word embedding algorithms using the cityblock distance metric.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="250"/>
              <col width="280"/>
              <col width="230"/>
              <col width="240"/>
              <thead>
                <tr valign="bottom">
                  <td>Size</td>
                  <td>Skipgram patients, n (%)</td>
                  <td>CBOW<sup>a</sup> patients, n (%)</td>
                  <td>GLoVe<sup>b</sup> patients, n (%)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>1</td>
                  <td>49 (7.7)</td>
                  <td>77 (12.1)</td>
                  <td>400 (62.7)</td>
                </tr>
                <tr valign="top">
                  <td>3</td>
                  <td>41 (6.4)</td>
                  <td>149 (23.4)</td>
                  <td>401 (62.8)</td>
                </tr>
                <tr valign="top">
                  <td>5</td>
                  <td>33 (5.2)</td>
                  <td>152 (23.8)</td>
                  <td>403 (63.2)</td>
                </tr>
                <tr valign="top">
                  <td>7</td>
                  <td>16 (2.5)</td>
                  <td>153 (24.0)</td>
                  <td>380 (59.6)</td>
                </tr>
                <tr valign="top">
                  <td>9</td>
                  <td>12 (1.9)</td>
                  <td>153 (24.0)</td>
                  <td>449 (70.4)</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table3fn1">
                <p><sup>a</sup>CBOW: Continuous Bag of Words.</p>
              </fn>
              <fn id="table3fn2">
                <p><sup>b</sup>GLoVe: Global Vectors.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <fig id="figure7" position="float">
            <label>Figure 7</label>
            <caption>
              <p>Visualization of the percentage of patients who have a significant difference between their in- and out-groups as a function of context window size for multiple word embedding algorithms using the cityblock distance metric.</p>
            </caption>
            <graphic xlink:href="jmir_v22i7e18055_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <p><xref ref-type="table" rid="table3">Table 3</xref> presents the patient-level analysis for different context window sizes. As shown in <xref rid="figure7" ref-type="fig">Figure 7</xref>, using the CBOW algorithm, an increasing window size initially correlates positively with the number of vulnerable patients, defined as having a significant difference between the in-group and out-group. The opposite trend is observed for the Skipgram model. Context window size does not appear to have an effect on word embeddings created using GloVe, as the number of patients remains relatively stable.</p>
        </sec>
      </sec>
      <sec>
        <title>Experiment 3: Scenario Simulation</title>
        <p>Having demonstrated that the difference between in- and out-groups is statistically significant, in this section, we showed that our hypothetical attack results in an actionable level of risk, that is, an attacker who has access only to released embeddings built from doctor-patient consultation notes that have been secured by using PHI removal may be able to arrive at a list of complete patient names, and associate these names with target diagnoses.</p>
        <p>We observed that for our chosen target diagnoses (ie, constipation, diarrhea, vaginitis, sexual dysfunction, urinary infection, herpes genitalis, dementia, anorexia, alcoholism, threatened abortion, and AIDS), our approach outperforms the majority baseline for both top-1 (A@1) and top-5 (A@5) accuracy of 0.00 and 0.70, respectively (see <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>); the top-<italic>n</italic> rate is the fraction of examples for which the correct label is among the <italic>n</italic> labels considered most probable by the model. The complete results for all hyperparameters as well as both distance metrics are presented in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p>
        <p>We observed similar results when the above experiment was repeated 1000 times for each tested hyperparameter, randomly selecting 30 target diagnoses. <xref ref-type="table" rid="table4">Table 4</xref> shows how often our attacker’s approach surpasses the baseline of choosing the majority diagnoses for both top-1 and top-5 accuracies. We show that we can consistently beat strong baselines, although the highest top-1 and top-5 accuracies are modest at 0.08 and 0.15, respectively. The complete results for all hyperparameters as well as both distance metrics are presented in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>The percentage of times using a word embedding–based attack beats the majority baseline for A@1 and A@5 for various context window sizes over 1000 random diagnosis selections.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="250"/>
            <col width="280"/>
            <col width="230"/>
            <col width="240"/>
            <thead>
              <tr valign="bottom">
                <td>Context window size<sup>a</sup></td>
                <td>Skipgram A@1, A@5</td>
                <td>CBOW<sup>b</sup> A@1, A@5</td>
                <td>GLoVe<sup>c</sup> A@1, A@5</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1</td>
                <td>55.8, 56.7</td>
                <td>61.8, 61.8</td>
                <td>55.4, 56.9</td>
              </tr>
              <tr valign="top">
                <td>3</td>
                <td>55.6, 53.1</td>
                <td>51.2, 52.6</td>
                <td>60.5, 59.5</td>
              </tr>
              <tr valign="top">
                <td>5</td>
                <td>57.4, 55.6</td>
                <td>53.6, 54.5</td>
                <td>59.4, 57.2</td>
              </tr>
              <tr valign="top">
                <td>7</td>
                <td>57.4, 53.5</td>
                <td>54.6, 53.9</td>
                <td>55.9, 54.0</td>
              </tr>
              <tr valign="top">
                <td>9</td>
                <td>57.2, 53.2</td>
                <td>53.7, 51.2</td>
                <td>60.6, 56.7</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>We observed that the majority baseline is surpassed consistently and up to 60% of the time.</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>CBOW: Continuous Bag of Words.</p>
            </fn>
            <fn id="table4fn3">
              <p><sup>c</sup>GLoVe: Global Vectors.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>In this work, we have shown the following:</p>
        <list list-type="bullet">
          <list-item>
            <p>There is a statistically significant difference between the distance of patients’ in- and out-groups at the population level.</p>
          </list-item>
          <list-item>
            <p>For many individual patients, the difference between their personal in-group and out-group is also statistically significant.</p>
          </list-item>
          <list-item>
            <p>A malicious actor working only with word embeddings may identify full names occurring in the training corpus of the embeddings as well as sensitive attributes associated with these names.</p>
          </list-item>
        </list>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>We explored the induced privacy (or lack of privacy) of embeddings created from medical notes. We empirically highlighted the security risks of sharing clinically sourced word embeddings. Although their nature does serve to obfuscate information, we have shown that it is still possible to connect PHI to names from word embeddings secured using PHI removal. There is much variation in the risks observed in this work, which are dependent on imperfect deidentification algorithms and very skilled attackers. The actual risk to patient information, while nonzero, remains small and dependent on many variables such as the attack strategy, deidentification method, and embedding algorithm. We therefore advocate for more research to see whether the adoption of PHI replacement would better secure released embeddings. In addition to deidentification methods (where more research needs to be done), appropriate controls on who can access the anonymized data and oversight of these data are also recommended.</p>
      </sec>
      <sec>
        <title>Conclusion</title>
        <p>We have focused on the reidentification of names and their association with diagnostic codes, although other sensitive PHIs may also be vulnerable. We demonstrated how sharing word embeddings trained on clinical notes that have been protected using only PHI removal is not safe, as any PHI missed by the algorithm will remain in its original context. The risk of obtaining sensitive information from embeddings can be diminished by applying the anonymization methods of PHI replacement on the clinical notes before training the embedding, that is, when all known PHIs have been randomly shuffled, it becomes much more difficult (but not impossible) to determine which names in the data set belong to true patients, as the names that are shuffled together will behave in a manner similar to true names that have been missed. Such embeddings can theoretically still be at risk if an attacker is able to determine how to differentiate between fake and true names. However, this would mitigate the methods of attack described in this work, thereby making the created embeddings more secure. Alternatively, noise can be added to the generated embedding model to induce privacy and reduce risk. This risk reduction is, naturally, relative to the amount of noise added, and determining the exact amount of noise without distorting the signal or degrading performance is the subject of future research.</p>
        <p>Regarding reassociating name tokens or associating names with diagnostic codes, Skipgram is least effective at preserving privacy, followed by CBOW and GloVe. However, when examining the number of statistically significant differences, we observe the opposite ranking. Although many sentence- and text-classification tasks observe little difference in downstream performance between these 3 algorithms, past work [<xref ref-type="bibr" rid="ref5">5</xref>] has demonstrated differences in the ability of these algorithms to represent words. For example, Skipgram can perform better than CBOW for more frequent words [<xref ref-type="bibr" rid="ref5">5</xref>], possibly explaining the difference in modeling names (which are infrequent in our vocabulary).</p>
        <p>As expected, tokens from the same complete name have closer vector representations. However, despite our intuition, we find that the in-group is surprisingly larger than the out-group. That is, the average distance between a name and a diagnosis is larger if the person with that name has a diagnosis. This was consistent among all parameters tested, and among all 3 embedding models. This was also observed in our novel data set. This was perplexing because our expectation of word embeddings informs us that words that occur in similar contexts should be closer together, and in-group diagnoses are often in the same note as the name, while out-group diagnoses are not in the note at all. Even though the name and diagnosis tokens may not co-occur directly, as they would gravitate to words that co-occur with both, this would result in the names and in-group diagnoses being closer. Although a deeper theoretical investigation remains to be conducted, we hypothesize that this may be due to interaction effects within the contexts; names are quite tightly clustered together, and they rarely occur in the same context window of the diagnosis with which they are associated. It may be that these other names draw the <italic>common</italic> diagnoses closer, as they occur with more names, in turn leaving the less common, but relevant, diagnoses further from the <italic>name</italic> cluster. This requires further research.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Wikipedia Appendix.</p>
        <media xlink:href="jmir_v22i7e18055_app1.docx" xlink:title="DOCX File , 14 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>ICES Appendix.</p>
        <media xlink:href="jmir_v22i7e18055_app2.docx" xlink:title="DOCX File , 59 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">CBOW</term>
          <def>
            <p>Continuous Bag of Words</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">GLoVe</term>
          <def>
            <p>Global Vectors</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">HCP</term>
          <def>
            <p>health care provider</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">HIPAA</term>
          <def>
            <p>Health Insurance Portability and Accountability Act</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">MIMIC</term>
          <def>
            <p>Multiparameter Intelligence Monitoring in Intensive Care</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">PHI</term>
          <def>
            <p>personal health information</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors thank the helpful staff at ICES (Dr Liisa Jaakkimainen, Dr Therese Stukel, Elisa Candido, Daniella Barron) for granting us access to data and for their insight into our manuscript. This study was supported by ICES, which is funded by an annual grant from the Ontario Ministry of Health and Long-Term Care. The analyses, conclusions, opinions, and statements expressed herein are solely those of the authors and do not reflect those of the funding or data sources; no endorsement is intended or should be inferred. This work was supported by grants from the Natural Sciences and Engineering Research Council of Canada to GH and FR, a Vanier Canada Graduate Scholarship to Moh A. FR is supported by a Canadian Institute for Advanced Research Chair in Artificial Intelligence.</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>Moh A and Mou A designed the statistical experiments. Moh A programmed the experiments. Moh A, Mou A, GH, and FR wrote the paper. Moh A and FR formulated the original problem. GH provided direction and guidance.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gehrmann</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dernoncourt</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Carlson</surname>
              <given-names>ET</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>JT</given-names>
            </name>
            <name name-style="western">
              <surname>Welt</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Foote</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Moseley</surname>
              <given-names>ET</given-names>
            </name>
            <name name-style="western">
              <surname>Grant</surname>
              <given-names>DW</given-names>
            </name>
            <name name-style="western">
              <surname>Tyler</surname>
              <given-names>PD</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
          </person-group>
          <article-title>Comparing deep learning and concept extraction based methods for patient phenotyping from clinical narratives</article-title>
          <source>PLoS One</source>
          <year>2018</year>
          <volume>13</volume>
          <issue>2</issue>
          <fpage>e0192360</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0192360"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0192360</pub-id>
          <pub-id pub-id-type="medline">29447188</pub-id>
          <pub-id pub-id-type="pii">PONE-D-17-23063</pub-id>
          <pub-id pub-id-type="pmcid">PMC5813927</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Afzal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Rastegar-Mojarad</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Kingsbury</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>A comparison of word embeddings for the biomedical natural language processing</article-title>
          <source>J Biomed Inform</source>
          <year>2018</year>
          <month>11</month>
          <volume>87</volume>
          <fpage>12</fpage>
          <lpage>20</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(18)30182-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2018.09.008</pub-id>
          <pub-id pub-id-type="medline">30217670</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(18)30182-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC6585427</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Craig</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Arias</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Gillman</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Predicting readmission risk from doctors' notes</article-title>
          <source>arXiv</source>
          <year>2017</year>
          <fpage>-</fpage>
          <comment>epub ahead of print (1711.10663)<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1711.10663"/></comment>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pennington</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Socher</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>GloVe: Global Vectors for Word Representation</article-title>
          <source>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2014</year>
          <conf-name>EMNLP'14</conf-name>
          <conf-date>October 29, 2014</conf-date>
          <conf-loc>Doha, Qatar</conf-loc>
          <pub-id pub-id-type="doi">10.3115/v1/d14-1162</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Distributed representations of words and phrases and their compositionality</article-title>
          <source>Adv Neural Inf Process Syst</source>
          <year>2013</year>
          <fpage>2265</fpage>
          <lpage>2273</lpage>
          <comment>
            <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Mamlin</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Schadow</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>McDonald</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>A successful technique for removing names in pathology reports using an augmented search and replace method</article-title>
          <source>Proc AMIA Symp</source>
          <year>2002</year>
          <fpage>777</fpage>
          <lpage>81</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/12463930"/>
          </comment>
          <pub-id pub-id-type="medline">12463930</pub-id>
          <pub-id pub-id-type="pii">D020002380</pub-id>
          <pub-id pub-id-type="pmcid">PMC2244188</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dernoncourt</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>JY</given-names>
            </name>
            <name name-style="western">
              <surname>Uzuner</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Szolovits</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>De-identification of patient notes with recurrent neural networks</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2017</year>
          <month>05</month>
          <day>1</day>
          <volume>24</volume>
          <issue>3</issue>
          <fpage>596</fpage>
          <lpage>606</lpage>
          <comment>
            <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/jamia/article/24/3/596/2769353"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocw156</pub-id>
          <pub-id pub-id-type="medline">28040687</pub-id>
          <pub-id pub-id-type="pii">ocw156</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Pollard</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lehman</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ghassemi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Moody</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Szolovits</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>RG</given-names>
            </name>
          </person-group>
          <article-title>MIMIC-III, a freely accessible critical care database</article-title>
          <source>Sci Data</source>
          <year>2016</year>
          <month>05</month>
          <day>24</day>
          <volume>3</volume>
          <fpage>160035</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/27219127"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/sdata.2016.35</pub-id>
          <pub-id pub-id-type="medline">27219127</pub-id>
          <pub-id pub-id-type="pii">sdata201635</pub-id>
          <pub-id pub-id-type="pmcid">PMC4878278</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sahlgren</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>An Introduction to Random Indexing</article-title>
          <source>7th International Conference on Terminology and Knowledge Engineering</source>
          <year>2005</year>
          <conf-name>TKE'05</conf-name>
          <conf-date>August 17-18, 2005</conf-date>
          <conf-loc>Copenhagen, Denmark</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://soda.swedish-ict.se/221/1/RI_intro.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ducharme</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Vincent</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>A neural probabilistic language model</article-title>
          <source>J Mach Learn Res</source>
          <year>2003</year>
          <volume>3</volume>
          <fpage>1137</fpage>
          <lpage>55</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/10.5555/944919.944966"/>
          </comment>
          <pub-id pub-id-type="doi">10.5555/944919.944966</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peters</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Neumann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Iyyer</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Deep Contextualized Word Representations</article-title>
          <source>Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</source>
          <year>2018</year>
          <conf-name>NAACL-HLT'18</conf-name>
          <conf-date>June 1-6, 2018</conf-date>
          <conf-loc>New Orleans, Louisiana, USA</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/n18-1202</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>MW</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: Pre-Training of Deep Bidirectional Transformers for Language Understanding</article-title>
          <source>Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</source>
          <year>2019</year>
          <conf-name>NAACL'19</conf-name>
          <conf-date>June 2-7, 2019</conf-date>
          <conf-loc>Minneapolis, Minnesota</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/N19-1423</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sahlgren</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>The distributional hypothesis</article-title>
          <source>Ital J Linguist</source>
          <year>2008</year>
          <volume>20</volume>
          <fpage>31</fpage>
          <lpage>51</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.italian-journal-linguistics.com/italian-journal-of-linguistics-2008/"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>How to generate a good word embedding</article-title>
          <source>IEEE Intell Syst</source>
          <year>2016</year>
          <month>11</month>
          <volume>31</volume>
          <issue>6</issue>
          <fpage>5</fpage>
          <lpage>14</lpage>
          <pub-id pub-id-type="doi">10.1109/mis.2016.45</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alsentzer</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Boag</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>WH</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Naumann</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>McDermott</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Publicly Available Clinical BERT Embeddings</article-title>
          <source>Proceedings of the 2nd Clinical Natural Language Processing Workshop</source>
          <year>2019</year>
          <conf-name>NLP'19</conf-name>
          <conf-date>June 7, 2019</conf-date>
          <conf-loc>Minneapolis, Minnesota, USA</conf-loc>
          <fpage>72</fpage>
          <lpage>8</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/w19-1909</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Jaan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Rajesh</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>ClinicalBERT: modeling clinical notes and predicting hospital readmission</article-title>
          <source>arXiv</source>
          <year>2019</year>
          <fpage>-</fpage>
          <comment>epub ahead of print (1904.05342)<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1904.05342"/></comment>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Shankai</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhiyong</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Transfer Learning in Biomedical Natural Language Processing: An Evaluation of BERT and ELMo on Ten Benchmarking Datasets</article-title>
          <source>Proceedings of the 18th BioNLP Workshop and Shared Task</source>
          <year>2019</year>
          <conf-name>BioNLP'19</conf-name>
          <conf-date>August 1, 2019</conf-date>
          <conf-loc>Florence, Italy</conf-loc>
          <fpage>58</fpage>
          <lpage>65</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/w19-5006</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Si</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Roberts</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Enhancing clinical concept extraction with contextual embeddings</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2019</year>
          <month>11</month>
          <day>1</day>
          <volume>26</volume>
          <issue>11</issue>
          <fpage>1297</fpage>
          <lpage>304</lpage>
          <pub-id pub-id-type="doi">10.1093/jamia/ocz096</pub-id>
          <pub-id pub-id-type="medline">31265066</pub-id>
          <pub-id pub-id-type="pii">5527248</pub-id>
          <pub-id pub-id-type="pmcid">PMC6798561</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Boitnott</surname>
              <given-names>JK</given-names>
            </name>
            <name name-style="western">
              <surname>Moore</surname>
              <given-names>GW</given-names>
            </name>
          </person-group>
          <article-title>Web-based free-text query system for surgical pathology reports with automatic case deidentification</article-title>
          <source>Arch Pathol Lab Med</source>
          <year>2001</year>
          <volume>125</volume>
          <fpage>1011</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://scholar.google.com/scholar_lookup?journal=Arch+Pathol+Lab+Med&#38;title=Web-Based+Free-Text+Query+System+for+Surgical+Pathology+Reports+with+Automatic+Case+Deidentification&#38;author=R+Miller&#38;author=JK+Boitnott&#38;author=GW+Moore&#38;volume=125&#38;publication_year=2001&#38;pages=1011&#38;"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kushida</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Nichols</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Jadrnicek</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Walsh</surname>
              <given-names>JK</given-names>
            </name>
            <name name-style="western">
              <surname>Griffin</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Strategies for de-identification and anonymization of electronic health record data for use in multicenter research studies</article-title>
          <source>Med Care</source>
          <year>2012</year>
          <month>07</month>
          <volume>50</volume>
          <issue>Suppl</issue>
          <fpage>S82</fpage>
          <lpage>101</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/22692265"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/MLR.0b013e3182585355</pub-id>
          <pub-id pub-id-type="medline">22692265</pub-id>
          <pub-id pub-id-type="pii">00005650-201207001-00017</pub-id>
          <pub-id pub-id-type="pmcid">PMC6502465</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Neamatullah</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Douglass</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Lehman</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Reisner</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Villarroel</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Long</surname>
              <given-names>WJ</given-names>
            </name>
            <name name-style="western">
              <surname>Szolovits</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Moody</surname>
              <given-names>GB</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>RG</given-names>
            </name>
            <name name-style="western">
              <surname>Clifford</surname>
              <given-names>GD</given-names>
            </name>
          </person-group>
          <article-title>Automated de-identification of free-text medical records</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2008</year>
          <month>07</month>
          <day>24</day>
          <volume>8</volume>
          <fpage>32</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/1472-6947-8-32"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1472-6947-8-32</pub-id>
          <pub-id pub-id-type="medline">18652655</pub-id>
          <pub-id pub-id-type="pii">1472-6947-8-32</pub-id>
          <pub-id pub-id-type="pmcid">PMC2526997</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schakel</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wilson</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Measuring word significance using distributed representations of words</article-title>
          <source>arXiv</source>
          <year>2015</year>
          <fpage>-</fpage>
          <comment>epub ahead of print (1508.02297)<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1508.02297"/></comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Holm</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>A simple sequentially rejective multiple test procedure</article-title>
          <source>Scand J Stat</source>
          <year>1979</year>
          <fpage>65</fpage>
          <lpage>70</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jstor.org/stable/4615733"/>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
