<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
    <front>
        <journal-meta>
            <journal-id journal-id-type="publisher-id">JMIR</journal-id>
            <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
            <journal-title>Journal of Medical Internet Research</journal-title>
            <issn pub-type="epub">1438-8871</issn>
            <publisher>
                <publisher-name>Gunther Eysenbach</publisher-name>
                <publisher-loc>JMIR Publications Inc., Toronto, Canada</publisher-loc>
            </publisher>
        </journal-meta>
        <article-meta>
            <article-id pub-id-type="publisher-id">v14i4e95</article-id>
            <article-id pub-id-type="pmid">22776692</article-id>
            <article-id pub-id-type="doi">10.2196/jmir.1898</article-id>
            <article-categories>
                <subj-group subj-group-type="article-type">
                    <subject>Original Paper</subject>
                </subj-group>
            </article-categories>
            <title-group>
                <article-title>P2P Watch: Personal Health Information Detection in Peer-to-Peer File-Sharing Networks</article-title>
            </title-group>
            <contrib-group>
                <contrib contrib-type="editor">
                    <name>
                        <surname>Eysenbach</surname>
                        <given-names>Gunther</given-names>
                    </name>
                </contrib>
            </contrib-group>
            <contrib-group>
                <contrib contrib-type="reviewer">
                    <name>
                        <surname>Perez-Rey</surname>
                        <given-names>David</given-names>
                    </name>
                </contrib>
                <contrib contrib-type="reviewer">
                    <name>
                        <surname>Doing-Harris</surname>
                        <given-names>Kristina</given-names>
                    </name>
                </contrib>
            </contrib-group>
            <contrib-group>
                <contrib contrib-type="author" id="contrib1" corresp="yes">
                    <name name-style="western">
                        <surname>Sokolova</surname>
                        <given-names>Marina</given-names>
                    </name>
                    <degrees>MSc, PhD</degrees>
                    <xref ref-type="aff" rid="aff1">1</xref>
                    <address>
                        <institution>Electronic Health Information Laboratory</institution>
                        <institution>CHEO Research Institute</institution>
                        <addr-line>401 Smyth Rd</addr-line>
                        <addr-line>Ottawa, ON, K1H 8L1</addr-line>
                        <country>Canada</country>
                        <phone>1 613 737 7600 ext 4104</phone>
                        <fax>1 613 731 1374</fax>
                        <email>sokolova@uottawa.ca</email>
                    </address>
                    <xref ref-type="aff" rid="aff2">2</xref>
                    <xref ref-type="aff" rid="aff3">3</xref>
                </contrib>
                <contrib contrib-type="author" id="contrib2">
                    <name name-style="western">
                        <surname>El Emam</surname>
                        <given-names>Khaled</given-names>
                    </name>
                    <degrees>PhD</degrees>
                    <xref ref-type="aff" rid="aff1">1</xref>
                    <xref ref-type="aff" rid="aff2">2</xref>
                </contrib>
                <contrib contrib-type="author" id="contrib3">
                    <name name-style="western">
                        <surname>Arbuckle</surname>
                        <given-names>Luk</given-names>
                    </name>
                    <degrees>MSc</degrees>
                    <xref ref-type="aff" rid="aff1">1</xref>
                </contrib>
                <contrib contrib-type="author" id="contrib4">
                    <name name-style="western">
                        <surname>Neri</surname>
                        <given-names>Emilio</given-names>
                    </name>
                    <degrees>BEng</degrees>
                    <xref ref-type="aff" rid="aff1">1</xref>
                </contrib>
                <contrib contrib-type="author" id="contrib5">
                    <name name-style="western">
                        <surname>Rose</surname>
                        <given-names>Sean</given-names>
                    </name>
                    <degrees>MSc</degrees>
                    <xref ref-type="aff" rid="aff4">4</xref>
                </contrib>
                <contrib contrib-type="author" id="contrib6">
                    <name name-style="western">
                        <surname>Jonker</surname>
                        <given-names>Elizabeth</given-names>
                    </name>
                    <degrees>BA</degrees>
                    <xref ref-type="aff" rid="aff1">1</xref>
                </contrib>
            </contrib-group>
            <aff id="aff1" rid="aff1">
                <sup>1</sup>
                <institution>Electronic Health Information Laboratory</institution>
                <institution>CHEO Research Institute</institution>
                <addr-line>Ottawa, ON</addr-line>
                <country>Canada</country>
            </aff>
            <aff id="aff2" rid="aff2">
                <sup>2</sup>
                <institution>Department of Pediatrics</institution>
                <institution>Faculty of Medicine</institution>
                <institution>University of Ottawa</institution>
                <addr-line>Ottawa, ON</addr-line>
                <country>Canada</country>
            </aff>
            <aff id="aff3" rid="aff3">
                <sup>3</sup>
                <institution>Epidemiology and Community Medicine</institution>
                <institution>Faculty of Medicine</institution>
                <institution>University of Ottawa</institution>
                <addr-line>Ottawa, ON</addr-line>
                <country>Canada</country>
            </aff>
            <aff id="aff4" rid="aff4">
                <sup>4</sup>
                <institution>Privacy Analytics</institution>
                <addr-line>Ottawa, ON</addr-line>
                <country>Canada</country>
            </aff>
            <pub-date pub-type="collection">
                <season>Jul-Aug</season>
                <year>2012</year>
            </pub-date>
            <pub-date pub-type="epub">
                <day>09</day>
                <month>07</month>
                <year>2012</year>
            </pub-date>
            <volume>14</volume>
            <issue>4</issue>
            <elocation-id>e95</elocation-id>
            <!--history from ojs - api-xml-->
            <history>
                <date date-type="received">
                    <day>27</day>
                    <month>07</month>
                    <year>2011</year>
                </date>
                <date date-type="rev-request">
                    <day>17</day>
                    <month>11</month>
                    <year>2011</year>
                </date>
                <date date-type="rev-recd">
                    <day>05</day>
                    <month>03</month>
                    <year>2012</year>
                </date>
                <date date-type="accepted">
                    <day>23</day>
                    <month>04</month>
                    <year>2012</year>
                </date>
            </history>
            <!--(c) the authors - correct author names and publication date here if necessary. Date in form ', dd.mm.yyyy' after jmir.org-->
            <copyright-statement>&#169;Marina Sokolova, Khaled El Emam, Luk Arbuckle, Emilio Neri, Sean Rose, Elizabeth Jonker. Originally published in the Journal of Medical Internet Research (http://www.jmir.org), 09.07.2012. </copyright-statement>
            <copyright-year>2012</copyright-year>
            <license license-type="open-access" xlink:href="http://creativecommons.org/licenses/by/2.0/">
                <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (http://creativecommons.org/licenses/by/2.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on http://www.jmir.org/, as well as this copyright and license information must be included.</p>
            </license>
            <self-uri xlink:href="http://www.jmir.org/2012/4/e95/" xlink:type="simple" />
            <abstract>
                <sec sec-type="background">
                    <title>Background</title>
                    <p>Users of peer-to-peer (P2P) file-sharing networks risk the inadvertent disclosure of personal health information (PHI). In addition to potentially causing harm to the affected individuals, this can heighten the risk of data breaches for health information custodians. Automated PHI detection tools that crawl the P2P networks can identify PHI and alert custodians. While there has been previous work on the detection of personal information in electronic health records, there has been a dearth of research on the automated detection of PHI in heterogeneous user files.</p>
                </sec>
                <sec sec-type="objective">
                    <title>Objective</title>
                    <p>To build a system that accurately detects PHI in files sent through P2P file-sharing networks. The system, which we call P2P Watch, uses a pipeline of text processing techniques to automatically detect PHI in files exchanged through P2P networks. P2P Watch processes unstructured texts regardless of the file format, document type, and content.</p>
                </sec>
                <sec sec-type="methods">
                    <title>Methods</title>
                    <p>We developed P2P Watch to extract and analyze PHI in text files exchanged on P2P networks. We labeled texts as PHI if they contained identifiable information about a person (eg, name and date of birth) and specifics of the person&#8217;s health (eg, diagnosis, prescriptions, and medical procedures). We evaluated the system&#8217;s performance through its efficiency and effectiveness on 3924 files gathered from three P2P networks.</p>
                </sec>
                <sec sec-type="results">
                    <title>Results</title>
                    <p>P2P Watch successfully processed 3924 P2P files of unknown content. A manual examination of 1578 randomly selected files marked by the system as non-PHI confirmed that these files indeed did not contain PHI, making the false-negative detection rate equal to zero. Of 57 files marked by the system as PHI, all contained both personally identifiable information and health information: 11 files were PHI disclosures, and 46 files contained organizational materials such as unfilled insurance forms, job applications by medical professionals, and essays.</p>
                </sec>
                <sec sec-type="conclusions">
                    <title>Conclusions</title>
                    <p>PHI can be successfully detected in free-form textual files exchanged through P2P networks. Once the files with PHI are detected, affected individuals or data custodians can be alerted to take remedial action.</p>
                </sec>
            </abstract>
            <kwd-group>
                <kwd>Privacy</kwd>
                <kwd>personal health information</kwd>
                <kwd>natural language processing</kwd>
                <kwd>text data mining</kwd>
            </kwd-group>
        </article-meta>
    </front>
    <body>
        <sec sec-type="introduction">
            <title>Introduction</title>
            <p>Evidence shows that files sent through peer-to-peer (P2P) file-sharing networks can disclose an individual&#8217;s personal health information (PHI) to millions of network users. PHI refers to information about one&#8217;s health that can be discussed in a clinical setting [<xref ref-type="bibr" rid="ref1">1</xref>]. For example, in more than 3000 files exchanged on P2P networks, 5% contained either sensitive or sufficient information to commit medical identity theft, sometimes for thousands of individuals [<xref ref-type="bibr" rid="ref2">2</xref>]. In another study, the authors semimanually examined 859 files gathered from two P2P networks and found that 8 (1%) files contained PHI [<xref ref-type="bibr" rid="ref3">3</xref>]. Although the disclosure numbers look comparatively small, files on P2P networks are accessible to millions of network users. The same study also showed that the P2P network users may not even be aware that the files can be read by all the peers.</p>
            <p>P2P files may be in various media (eg, visual, audio, and text), may address various topics (eg, fashion, tax report, and family life), and may be written in any language (eg, Spanish and French). To effectively deal with such challenges, automated PHI detection must perform well on multiple tasks, such as language identification, filtering out of damaged and virus files, text extraction, and hierarchical multiclass classification of documents. At the same time, the volumes of files exchanged through P2P networks and expectations of privacy of personal communications make manual PHI detection impossible. While there are several traditional PHI detection tools, they are not suitable for the large-volume analysis of heterogeneous documents. For example, some of these tools are designed to work with semistructured electronic health records and to find personal identifiers, such as patients&#8217; and doctors&#8217; names, insurance parameters, and hospital and clinic names [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref10">10</xref>].</p>
            <p>In this paper, we describe and evaluate a new system&#8212;P2P Watch&#8212;that has been constructed specifically to crawl through P2P networks and automatically detect whether a retrieved file contains PHI. To be defined as PHI, a P2P file must contain information that would provide someone with the ability to identify a unique individual as well as health information on that individual, such as procedures or drugs. For example, generic statements such as &#8220;John Smith caught a cold&#8221; would be rejected as PHI according to our definition, unless they are reinforced by Smith&#8217;s residential or work address and the prescription drugs he is taking.</p>
            <p>We empirically evaluated our system on three networks: FastTrack, Gnutella, and eD2K. These were chosen due to their global popularity and high share of users. We harvested 3924 files and applied P2P Watch on the file contents. No author metadata was used in the file analysis.</p>
            <p>We concentrate on PHI for Canadians. For example, our syntactic patterns that detect provincial health care numbers and the organization types are adjusted for Canada. Although P2P Watch focuses on Canadian PHI, it also recognizes geographic locations and zip codes in the United States because Canadians may have been born elsewhere or the PHI may concern a trip.</p>
            <p>P2P Watch provides a mechanism for data custodians and individuals to determine whether information about their patients, employees, or themselves is being exposed. To minimize potential organizational and individual harm from such inappropriate disclosures, automated PHI detection tools can crawl through P2P networks looking for PHI. Once PHI is detected, affected individuals and data custodians can be alerted to take remedial action.</p>
            <p>PHI disclosure on P2P networks is part of a wider trend of PHI presence on the Web [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref14">14</xref>]. PHI appears in electronic news, blogs by health care professionals and military personnel, Web-posted user messages, medical student papers, and personal letters [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref23">23</xref>]. For example, Doing-Harris and Zeng-Treitler [<xref ref-type="bibr" rid="ref15">15</xref>] extracted health-related terms from messages posted on PatientsLikeMe.com. They manually evaluated the vocabulary used and found 651 health terms that were not yet included in a medical thesaurus. Another study analyzed user requests posted on an involuntary childlessness message board [<xref ref-type="bibr" rid="ref16">16</xref>]. Blogs written by military servicemen were examined to find descriptions of clinically relevant combat exposure [<xref ref-type="bibr" rid="ref17">17</xref>]. Lampos and Cristianini [<xref ref-type="bibr" rid="ref18">18</xref>] used Wikipedia&#8217;s page on influenza and the UK&#8217;s National Health Service website for automated extraction of influenza-like illness markers. They subsequently used the markers to find H1N1-related tweets but did not extract personally identifiable information (PII) about the users. Sokolova et al [<xref ref-type="bibr" rid="ref19">19</xref>] presented a method of patient-based health information extraction from P2P files. They manually evaluated extraction accuracy on 2000 P2P files.</p>
            <p>Interest in PHI published on the Web is ongoing [<xref ref-type="bibr" rid="ref20">20</xref>]. Although openness and information sharing are beneficial to the population at large, users may not be aware of the secondary use of their information and consequent privacy issues [<xref ref-type="bibr" rid="ref21">21</xref>]. A qualitative study of 123 user comments on the online community PatientsLikeMe was dedicated to analysis of sharing PHI among people with similar ailments [<xref ref-type="bibr" rid="ref22">22</xref>]. Chou et al [<xref ref-type="bibr" rid="ref23">23</xref>] identified younger users, those with poorer subjective health, and those with a personal cancer experience as more likely participants in online support groups and more willing to share their PHI.</p>
            <p>So far, PHI detection tools have been developed and deployed by health care organizations in the context of de-identifying the organization&#8217;s records, such as clinical discharge summaries, nurses&#8217; notes, and pathology reports. The main de-identification approach was to classify individual words as presenting personally identifiable information (PII) or not [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref10">10</xref>]. Such approaches require a substantial amount of labeled training data (eg, 1000 documents [<xref ref-type="bibr" rid="ref7">7</xref>]) and consume considerable processing time.</p>
            <p>In electronic health records, PHI detection can be boosted by the use of the personal information found in the structured part of the document or by pulling in structured information from the medical record database. Customized dictionaries present another source of accuracy in detecting PHI&#8212;these include local geographic names, health care organizations, and patient names [<xref ref-type="bibr" rid="ref24">24</xref>]. These tools also can determine with certainty that there is health information in the documents they analyze and therefore focus only on the detection of PII. We provide evidence that such tools can fail to identify PHI in free-form textual files.</p>
        </sec>
        <sec sec-type="methods">
            <title>Methods</title>
            <p>We designed and implemented P2P Watch, which automatically detects P2P network files that contain PHI.</p>
            <sec>
                <title>System Architecture</title>
                <p>The system was designed as a pipeline of seven components: (1) duplicate file removal, (2) media content removal, (3) text extractor, (4) language identifier, (5) publishable content identifier, (6) PII detector, and (7) patient-oriented health information detector. Components 1&#8211;4 identify and filter out irrelevant files through a shallow analysis, component 5 finds irrelevant files by applying partial content analysis, and components 6 and 7 identify relevant files within the remaining set. At each stage files may be discarded, resulting in fewer and fewer files making it through the pipeline.</p>
                <p>
                    <xref ref-type="fig" rid="figure1">Figure 1</xref> presents the system design.</p>
                <sec>
                    <title>Duplicate Removal</title>
                    <p>The first task of the file processing was to find and remove multiple copies of the same file; such duplicates can happen, as the same file can be harvested from multiple users. For each pair of files, we compared their sizes (in kilobytes), titles, and first and last sentences. If all the parameters were the same, we tagged the two files as duplicates and kept only one for further processing.</p>
                </sec>
                <sec>
                    <title>Media Content Removal</title>
                    <p>We assumed that any published text was not leaking PHI&#8212;for example, writings describing fictional characters, and magazine and newspaper articles that contain information that is already public. We used Amazon Web Services as a source database of publication titles [<xref ref-type="bibr" rid="ref25">25</xref>]. Although the number of titles fluctuates almost daily, the database has 400,000 to 500,000 titles for books; recording companies such as Sony, EMI, and Universal have more than 250,000 music titles in the database. Files with exact matching titles were discarded. Exceptions were made for files with titles that included such words as <italic>notification</italic>, <italic>affidavit</italic>, <italic>justice</italic>, <italic>discharge</italic>, and <italic>lab</italic>. Our system did not discard these files and retained them for further processing. If there was no exact title matching, the file was passed on for further processing.</p>
                </sec>
                <sec>
                    <title>Text Extractor</title>
                    <p>The text extractor converted files into raw text by removing all formatting meta-information. It also discarded nontext files (eg, images and music), corrupted files, and viruses. A wide range of input file formats suggested the use of several tools, with each tool extracting text from specific formats. We used the open source program Antiword to extract text from Microsoft Word documents [<xref ref-type="bibr" rid="ref26">26</xref>]. To extract text from PDF, RTF, HTML, or XML files, we used the open source programs MineText [<xref ref-type="bibr" rid="ref27">27</xref>] and GetText [<xref ref-type="bibr" rid="ref28">28</xref>].</p>
                </sec>
                <sec>
                    <title>Language Identifier</title>
                    <p>We differentiated between English-language texts and texts written in other languages. We applied a publicly available language identifier, TextCat Language Guesser, which can identify 69 languages [<xref ref-type="bibr" rid="ref29">29</xref>]. For each text, the tool outputs several possible languages. If English was the most likely language of the text, then it appeared at the beginning of the output. Our manual examination showed that, in our sample data, the first English tag always correctly marked texts written in English. We discarded a file if English was not the most likely language of the text.</p>
                </sec>
                <sec>
                    <title>Publishable Content Removal</title>
                    <p>P2P Watch looked for files with nonpersonal content. It filtered out published and educational materials (eg, assignments and theses) and nonpersonal texts (eg, manuals and technical reports) that were not found by the title lookup. We also hypothesized that music lyrics, discussion of popular fictional characters or current political events, and advertisements would be unlikely candidates for leaking explicit, detailed PHI. We built a list of fictional characters (eg, Bart Simpson), celebrities (eg, Paris Hilton), and public figures (eg, George Bush). We considered that, by the nature of their occupations, celebrities and public figures would have a lot of information about them publicly known and therefore any PHI pertaining to these individuals would not be considered a breach. To perform this task, we built a list of terms that appeared in the preface of publishable and educational texts; the terms are listed in <xref ref-type="app" rid="app1">Multimedia Appendix 1</xref>. We used regular expressions to locate those terms and their variations in the first 200 words of the file. <xref ref-type="table" rid="table1">Table 1</xref> lists word categories and examples.</p>
                    <fig id="figure1" position="float">
                        <label>Figure 1</label>
                        <caption>
                            <p>Components of P2P Watch. P2P = peer-to-peer, PHI = personal health information.</p>
                        </caption>
                        <graphic xlink:href="jmir_v14i4e95_fig1.PNG" alt-version="no" mimetype="image" position="float" xlink:type="simple" />
                    </fig>
                    <table-wrap id="table1" position="float">
                        <label>Table 1</label>
                        <caption>
                            <p>Publishable content identifiers.</p>
                        </caption>
                        <table cellpadding="8" cellspacing="0" border="1" rules="groups" frame="hsides" width="1000">
                            <col width="305" />
                            <col width="695" />
                            <thead>
                                <tr valign="top">
                                    <td>  Category  </td>
                                    <td>  Example  </td>
                                </tr>
                            </thead>
                            <tbody>
                                <tr valign="top">
                                    <td>  Books  </td>
                                    <td>  Ebook, ISBN  </td>
                                </tr>
                                <tr valign="top">
                                    <td>  Education  </td>
                                    <td>  Thesis, assignment  </td>
                                </tr>
                                <tr valign="top">
                                    <td>  Retail  </td>
                                    <td>  Tim Hortons  </td>
                                </tr>
                                <tr valign="top">
                                    <td>  Periodical  </td>
                                    <td>  Magazine, article  </td>
                                </tr>
                                <tr valign="top">
                                    <td>  Fictional  </td>
                                    <td>  Harry Potter  </td>
                                </tr>
                                <tr valign="top">
                                    <td>  Politics  </td>
                                    <td>  Nicolas Sarkozy  </td>
                                </tr>
                            </tbody>
                        </table>
                    </table-wrap>
                </sec>
                <sec>
                    <title>PII Detector</title>
                    <p>We considered that a person can be identified from first and last names, addresses, dates (which can be linked to identifiable events such as birth, death, and marriage), and organization names (eg, school, church, and professional association). We divided PII into three categories: person information (eg, names, family relations, and age-defining events), structured information (eg, dates, telephone numbers, and email), and geographic location (eg, street address and organization). The categories overlap: for instance, telephone numbers are both geographic identifiers and numeric identifiers. <xref ref-type="fig" rid="figure2">Figure 2</xref> illustrates the category relations.</p>
                    <p>For the personal name lookup, we acquired a female first name list, a male first name list, and a last name list. The lists contained both formal and informal name forms (eg, William, Bill, and Billy) and non-Anglo-Saxon names (eg, Meehai and Leila), as well as common misspellings (eg, Bll). The lists came with a commercial database marketing tool that performed best in an independent evaluation [<xref ref-type="bibr" rid="ref30">30</xref>].</p>
                    <p>To reduce computationally expensive person name lookups, we first searched for patterns of family relations in the text (eg, &#8220;my daughter&#8221; and &#8220;an uncle of&#8221;), self-identification (eg, &#8220;my name&#8221; and &#8220;sincerely&#8221;), or life event (eg, &#8220;was born&#8221; and &#8220;died in&#8221;). Depending on the patterns, either the preceding or the following capitalized words were stored in a file&#8217;s name list. Having a file&#8217;s name list considerably accelerated the file processing. Further, when P2P Watch checked for a person name, it first checked with the file&#8217;s name list.</p>
                    <p>Additionally, PII included standardized information as follows: (1) telephone numbers: we looked for complete and incomplete formats used in North America; (2) health insurance numbers: we looked for health insurance numbers assigned by each province (Canada); (3) dates: we restricted the dates to the 20th and 21st centuries, as earlier dates are unlikely to be related to health information of living human beings, and dates had to be specific (&#8220;March 9th, 1999&#8221; indicated a specific date, whereas &#8220;March was chilly&#8221; did not); (4) email addresses (eg, john@canada.ca, john AT Canada DOT ca); and (5) postal codes in Canada and zip codes in the United States. These five categories were retrieved by manually built soft regular expressions.</p>
                    <p>Apart from geographic locations in Canada and the United States, we used some international information and introduced different granularity for different geographic categories. First was country: all the UN-recognized countries and their capitals (eg, France and Paris; Liberia and Monrovia) and self-proclaimed entities (eg, Eritrea and Abkhazia). Second was place. In the United States, we used state name, state capital, and&#8212;to cover the biggest single population unit&#8212;the largest city in the state; for example, we had Illinois, Springfield, and Chicago. In Canada, we used province, provincial capital, largest cities, and tourist attractions (eg, Alberta, Edmonton, Calgary, Banff), and the same for territories. In Europe, Latin America, Asia, Africa, and Australia, we used major cities that are not national capitals. The list is given in <xref ref-type="app" rid="app2">Multimedia Appendix 2</xref>.</p>
                    <p>The Canadian Judicial Council further considers certain organizations to be part of PII [<xref ref-type="bibr" rid="ref31">31</xref>], such as public institutions (eg, schools and churches), care providers (eg, aid societies and foster homes), the names of support organizations (eg, women&#8217;s and senior&#8217;s support centers), and other location identifiers (eg, educational institutions and military bases). The organization names were expressed in many language forms. We modeled language patterns (eg, &#8220;lived in&#8221; and &#8220;come from&#8221;), organization types (eg, schools, military units, and churches), and the target population (eg, youth, women, and seniors).</p>
                    <p>To be marked as a text with PII, the file had to contain a geographic identifier and two other personal identifiers, such as a person&#8217;s first name and last name, or a person&#8217;s first name and date of birth. All the files marked as PII were passed on to the last component.</p>
                    <fig id="figure2" position="float">
                        <label>Figure 2</label>
                        <caption>
                            <p>Personally identifiable information (PII) categories and their subcategories. ID = identification.</p>
                        </caption>
                        <graphic xlink:href="jmir_v14i4e95_fig2.jpg" alt-version="no" mimetype="image" position="float" xlink:type="simple" />
                    </fig>
                </sec>
                <sec>
                    <title>Patient-Oriented Health Information Detector</title>
                    <p>Disease names (eg, arthritis and mumps) and symptoms (eg, chest pain and headache), and procedures (eg, heart surgery and x-rays) most directly convey health information that is usually discussed in a clinical setting. To build a list of corresponding terms, we used the International Classification of Diseases [<xref ref-type="bibr" rid="ref32">32</xref>] and the Medical Dictionary for Regulatory Activities (MedDRA) [<xref ref-type="bibr" rid="ref33">33</xref>]. Drug names, too, may allow one to infer a specific medical, behavioral, or psychological condition or ailment of another individual. We used the Canadian Drug Product Database (Active and Inactive), which contains the names of drugs approved for use in Canada and previously available drugs [<xref ref-type="bibr" rid="ref34">34</xref>]. To accommodate extraction of various drug names, we obtained a list of generic drug names and the trade names associated with them [<xref ref-type="bibr" rid="ref35">35</xref>]. However, the resources listed above leave some gaps in the detection of health information. The most noticeable are acronyms (eg, ICU), specialties of health care providers (therapist, surgeon), and some condition names (blood pressure, tube fed). To fill the gaps, we manually searched Webster&#8217;s New World Medical Dictionary [<xref ref-type="bibr" rid="ref36">36</xref>]. We minimized the above-mentioned resources by removing unrelated categories (animal diseases, animal drugs). Then the remaining texts were converted to lowercase, punctuation marks and numbers were removed, and stop words (eg, of and when) were eliminated. The list of the resulting keywords is given in <xref ref-type="app" rid="app3">Multimedia Appendix 3</xref>.</p>
                    <p>More details on the method can be found in Sokolova et al [<xref ref-type="bibr" rid="ref19">19</xref>].</p>
                </sec>
            </sec>
            <sec>
                <title>Empirical Evaluation</title>
                <p>Project approval from the Research Ethics Board of Children&#8217;s Hospital of Eastern Ontario was obtained prior to retrieving data from the P2P networks.</p>
                <sec>
                    <title>Files Analyzed</title>
                    <p>The files were gathered from April 2008 to June 2009 from three networks. We selected FastTrack, Gnutella, and eD2K networks due to their global popularity and high share of users [<xref ref-type="bibr" rid="ref2">2</xref>]. To automatically search for and download P2P files, we modified the publicly available Shareaza P2P client [<xref ref-type="bibr" rid="ref37">37</xref>], which is a software package allowing one to connect to multiple P2P networks simultaneously. In-house modifications to Shareaza included changes to the search function and an increase of logging capabilities.</p>
                    <p>We focused on capturing the most popular document formats: Microsoft Word (.doc), raw text (.txt), Rich Text Format (.rtf), Excel (.xls), PowerPoint (.ppt), Portable Document Format (.pdf), Extensible Markup Language (.xml), and HyperText Markup Language (.html). The search function was modified to automatically search for those formats and automatically retrieve the files. Automatic searches were conducted by the code at 15 minute intervals.</p>
                    <p>In total, we gathered 3924 files. The data were sent for processing as is, without preliminary normalization: we preserved all the initial spelling, capitalization, grammar, and so on.</p>
                </sec>
            </sec>
            <sec>
                <title>Performance Evaluation</title>
                <p>We evaluated the system&#8217;s efficiency through the time (seconds) it took each component to process the related files. Technical specifications of our equipment were as follows: Windows Server 2003 (Microsoft Corporation, Redmond, WA, USA), 3.20 GHz Intel Core i3 processor (Intel Corporation, Santa Clara, CA, USA), 4 GB RAM, and 500 GB SATA hard disk.</p>
            </sec>
            <sec>
                <title>Effectiveness Evaluation</title>
                <p>We evaluated the effectiveness of P2P Watch by the number of PHI files found among the files it discarded as non-PHI (false-negative PHI files) and by the number of PHI files it marked as PHI (true-positive PHI files).</p>
                <p>Exact estimation of true-positive PHI files was feasible due to the small output we expected. However, the exact estimation of false-negative PHI was not feasible due to the large volume of data. To estimate the presence of false-negative PHI files in the discarded files, we employed a sampling technique. The sampling technique is described below.</p>
            </sec>
            <sec>
                <title>Comparison With Other PHI Detection Tools</title>
                <p>Currently deployed PHI tools are designed to analyze electronic health records produced by selected health care organizations [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref10">10</xref>]. The tools work on the assumption that (1) there are PHI words within each document, and (2) these words belong to a restricted number of categories (eg, local doctors, local hospitals). Most of the tools are proprietary and cannot be easily assessed for comparative evaluation [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>].</p>
                <p>The open source PHI detection tool De-id is popular among researchers [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. To ensure a fair tool comparison and adherence to a main content assumption of PHI presence in the text, we applied De-id on P2P files that P2P Watch had identified as PHI.</p>
            </sec>
            <sec>
                <title>Sample Size to Detect PHI in Discarded Files: False-Negative Evaluation</title>
                <p>Because there were 3924 files in total and we expected most of them not to contain PHI, a manual examination of all of the discarded files would have been exceedingly time consuming. To compute the false-negative rate, we estimated the number of PHI files that could appear in a random sample of P2P files.</p>
                <p>We wanted to determine sample sizes in advance. To obtain a conservative estimate, we decided on multiple sampling, where an individual sample is randomly drawn from a separate group of discarded files. Based on the P2P Watch architecture, we identified the three main groups of discarded files: group 1&#8212;files discarded by Amazon search, text extractor, and language identifier; group 2&#8212;files discarded by the content filter; group 3&#8212;files discarded by the PII detector and health information detector.</p>
                <p>We used a binomial distribution as a model for P2P file data, assuming that either a file contains PHI or it does not [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. Equation 1 in <xref ref-type="fig" rid="figure3">Figure 3</xref> shows the probability of detecting at least one file with PHI, assuming a binomial distribution, where <italic>&#952; </italic>is the rate of PHI we wished to detect, and <italic>n </italic>is the number of independent samples. Equation 1 can be rearranged as Equation 2 (<xref ref-type="fig" rid="figure3">Figure 3</xref>).</p>
                <p>Previous studies showed similarity in the ratio of PHI files among reviewed P2P files: approximately 1% of all the files contained PHI. Therefore, we used this estimate in Equation 2 to define the sample size for groups 1&#8211;3. For each group of discarded files, to have a 95% chance of detecting PHI when the underlying rate of files with PHI is at least 1%, we needed to sample at least 300 files. For the complete data, we wanted to sample at least 900 files.</p>
                <fig id="figure3" position="float">
                    <label>Figure 3</label>
                    <caption>
                        <p>Equation (1, rearranged in 2) for determining the probability of detecting at least one file containing personal health information.</p>
                    </caption>
                    <graphic xlink:href="jmir_v14i4e95_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple" />
                </fig>
            </sec>
            <sec>
                <title>Special Protocols</title>
                <p>Three special protocols were put in place for this study. First, we expected some files to contain inappropriate or obscene material (eg, pornography). We therefore did not explicitly look through image files (file extensions .gif, .jpg, .psd, .tif, and .bmp). Second, if we discovered any illegal materials (eg, child pornography), we passed that information on to the police. Third, if there were cases of disclosure of particularly sensitive personal information or PHI for a large number of individuals, then we reported them to the appropriate federal or provincial privacy commissioner for follow-up.</p>
            </sec>
        </sec>
        <sec sec-type="results">
            <title>Results</title>
            <p>We applied P2P Watch for PHI detection in 3924 files exchanged on the three P2P networks. The total data set size was 9887 MB. The total processing time was 4132.41 seconds.</p>
            <p>
                <xref ref-type="fig" rid="figure4">Figure 4</xref> illustrates changes in the number of files processed by each component during our empirical evaluation.</p>
            <fig id="figure4" position="float">
                <label>Figure 4</label>
                <caption>
                    <p>File processing steps by P2P Watch. PHI = personal health information, PII = personally identifiable information.</p>
                </caption>
                <graphic xlink:href="jmir_v14i4e95_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple" />
            </fig>
            <sec>
                <title>Observations</title>
                <p>We made several general observations during the analysis as follows.</p>
                <sec>
                    <title>Duplicate File Removal</title>
                    <p>We considerably reduced processing time by first testing whether a file was a duplicate of an already gathered file. Removal of duplicates does not affect the quality of PHI detection, as P2P Watch should process the original file. We discarded 514 files (eg, Quick Recipe And Meal Ideas, 36 Christmas Carols &#38; Songs), mostly books and technical manuals; the size of discarded files was 786 MB.</p>
                </sec>
                <sec>
                    <title>Media Content Removal</title>
                    <p>We discarded 286 files, which contained content that was found in the Amazon.com database (eg, Iron Crypt of the Heretics; The Haunted Lighthouse; New York, New York; ABBA&#8217;s The Winner Takes It All; and The Abominable Snowman); the size of discarded files was 1234 MB. We requested the exact match of the file title with the database entry; hence, some files with published content were passed through this filter.</p>
                </sec>
                <sec>
                    <title>Text Extractor</title>
                    <p>We worked with different file formats to extract text and discard images, virus files, and corrupted files. At this stage, 673 files were discarded, mostly mp3 and rm files with music and video contents; the size of the discarded files was 3077 MB.</p>
                </sec>
                <sec>
                    <title>Language Identifier</title>
                    <p>We identified English-language texts among the remaining files; 882 non-English texts were discarded, with a total size of 10 MB. To avoid misrepresentation of North America as a polyglot continent, we checked the discarded files and found that many of them had explicitly erotic content.</p>
                </sec>
                <sec>
                    <title>Publishable Content Identifier</title>
                    <p>Based on processing of the first 200 words, 880 files were discarded, with a total size of 171 MB. Books composed the majority of the discarded files, with the remaining part comprising cover letters, resumes, homework, and forms.</p>
                </sec>
                <sec>
                    <title>Personally Identifiable Information Detector</title>
                    <p>Based on the search for PII within a complete file, 550 files were discarded. Most of these texts were in a small publishable form such as articles, opinion pieces, local community letters, forms, and job application packages. The size of discarded files was 3.9 MB.</p>
                </sec>
                <sec>
                    <title>Patient-Oriented Health Information Detector</title>
                    <p>The last component identified 57 files as potential PHI and discarded 240 files. Among the discarded files, most promoted different types of consumer goods and services (credit services, fitness and skin care, gadgets, etc); some tax and insurance forms were discarded as well.</p>
                </sec>
            </sec>
            <sec>
                <title>Efficiency Evaluation</title>
                <p>We timed the performance of each component. The most time was used by the text extractor, the first component to process the complete file text. The most efficient components were duplicate removal and Amazon.com search, which processed the most files in the least time. <xref ref-type="fig" rid="figure5">Figure 5</xref> illustrates time spent by each component on the data processing.</p>
                <fig id="figure5" position="float">
                    <label>Figure 5</label>
                    <caption>
                        <p>Processing time (seconds) of P2P Watch components.</p>
                    </caption>
                    <graphic xlink:href="jmir_v14i4e95_fig5.jpg" alt-version="no" mimetype="image" position="float" xlink:type="simple" />
                </fig>
            </sec>
            <sec>
                <title>Effectiveness Evaluation: True-Positive Rate</title>
                <p>P2P Watch flagged 57 files as containing PHI. Manual analysis confirmed that all the files contained PII and PHI. However, we distinguished between true PHI and pseudo-PHI. Among these 57 files, 11 contained health information about an identifiable individual: they were indeed PHI (true PHI). PHI appeared in various types of documents&#8212;for example, a note to a temporary guardian and a lawyer&#8217;s note were shown to contain PHI.</p>
                <p>Other marked files contained both PII and PHI, but were not PHI (pseudo-PHI). A few examples of pseudo-PHI discovered through a manual check of the P2P files contained information that the information owner had intentionally allowed to be part of a public audience. In one instance, the data owner stated &#8220;When I was four, I ended up in the hospital for playing with medicine&#8221; as part of a book report on Curious George. Another pseudo-PHI file was a nursing student&#8217;s assignment, which contained initials for a patient at a hospital and a room number, but not enough information to identify the patient without access to more detailed hospital records. The curriculum vitae of a medical professional and a medical insurance form are other examples of such files.</p>
            </sec>
            <sec>
                <title>Effectiveness Evaluation: False-Negative Rate</title>
                <p>We ran P2P Watch without the duplicate removal filter. We invited two independent evaluators to read all the sampled files and mark them as PHI or not. One author (EN) participated in the evaluation of some files.</p>
                <p>We first randomly selected three samples of 300 discarded files, where one sample consisted of files discarded by Amazon search, text extractor, and language identifier; another sample consisted of files discarded by the content filter; and the third sample consisted of files discarded by the PII detector and health information detector.</p>
                <p>None of the 900 sampled files contained PHI. We, however, were concerned with the possibility of PHI in files discarded by the PII detector and health information detector. We therefore chose to manually check all the remaining 678 files in this stratum, and found none containing PHI.</p>
                <p>As a result, we manually checked 1578 discarded files and found that none of them contained PHI.</p>
            </sec>
            <sec>
                <title>Comparison With De-id</title>
                <p>The open source tool De-id [<xref ref-type="bibr" rid="ref8">8</xref>] gave us an opportunity to test the applicability of existing tools to finding PHI in P2P files. We applied De-id to find PHI in the 57 files that P2P Watch marked as PHI. By that time, we knew that the files indeed contain PII and PHI. We had to use the text extractor component, as De-id works only with text format.</p>
                <p>De-id crashed on 3 files. The tool did not recognize any of the remaining 54 files as PHI. The tool mislabeled many critical terms (eg, <italic>risk </italic>and <italic>blood </italic>were both marked as ambiguous last names, and <italic>disorder </italic>and <italic>depression </italic>were not recognized as health related). De-id took an average of 11 seconds to process a short file.</p>
                <p>This empirical evidence showed that major components have to be added to De-id: a text extractor, and new PII and PHI detection components.</p>
            </sec>
        </sec>
        <sec sec-type="discussion">
            <title>Discussion</title>
            <p>In this study, we have introduced P2P Watch, which detects PHI in files shared by users of P2P networks. Although the proportion of PHI files among P2P files is rather small, the overall problem is big, as by some estimates 50% of files downloaded and 80% of files uploaded on the Internet are through P2P networks [<xref ref-type="bibr" rid="ref40">40</xref>]. However, even one PHI file can do much harm, especially if it contains an exact pointer to a publicly available database. At the same time, we empirically showed that traditional de-identification tools are not designed to detect PHI in P2P files.</p>
            <p>P2P Watch is capable of working within the complex environment of P2P networks. It detects PHI in files in which context, content, and format type vary. Within the data set of 3924 P2P files, the system detected 11 PHI files. Our manual evaluation of 1578 files, marked by the system as non-PHI, confirmed that these files indeed did not contain PHI. The sampling results showed that P2P Watch was very unlikely to miss PHI files.</p>
            <p>For successful PHI detection in P2P networks, it is essential that the detection system process large volumes of heterogeneous data input in a timely manner and can withstand substantial irrelevant information. A reliable solution is based on two factors [<xref ref-type="bibr" rid="ref41">41</xref>]. First, a high confidence that the limited number of analyzed texts will not exclude any possible PHI texts is needed; this can be achieved through filtering out only the files guaranteed not to contain PHI. Second, a speedy detection process is needed that prevents a prolonged presence of PHI texts on the network; this can be achieved through minimizing time of filtering with respect to performed text analysis.</p>
            <p>P2P Watch efficiently reduces the time of PHI exposure. Our detection strategy is up-front shallow text processing, whose goal is to quickly process the vast majority of input files, followed by a thorough analysis of a small number of selected texts. This thorough analysis phase used electronic and hard-copy dictionaries of health care terms, an ontology of medical terms, and lists of personal names. We supplemented those sources with in-house built gazetteers (topical lists of geographic information) and lists of organization types. P2P Watch reserves comprehensive text analysis for a small number of selected files, while performing fast and accurate shallow processing for the vast majority of files. This is the principal difference from previously built systems, which process all the input files equally. The difference may be explained by the fact that previous systems were designed to detect patients&#8217; PII in electronic medical records, whereas our P2P Watch searches for both PII and PHI in previously unseen documents. Once a file is flagged as containing PHI, the individuals affected can be alerted. A search for a data custodian&#8217;s name within the flagged files would indicate which custodian to alert, for example.</p>
            <p>Several possible expansions of the functionality of P2P Watch are being considered for future work. Our current detection is limited to text written in English. Expanding P2P Watch capacities to other languages such as French and Spanish would capture more PHI leaks. Furthermore, we want to build separate components to identify files that contain PII and health information, but are not PHI (pseudo-PHI). Resumes, recipes, incomplete health forms (eg, insurance), and public health announcements are examples of files that were falsely labeled as PHI. The idea would be to detect these types of documents and automatically exclude them early in the P2P Watch pipeline. A challenge with forms is that the empty forms may contain pseudo-PHI (eg, fields for human immunodeficiency test results). Special analysis of such forms is required to determine which content is part of the form and which is completed by the user.</p>
            <p>Another future avenue is to add localized detection of PHI in the United States; this expansion may involve building new customized lists of organization and trademark names. In the future, a deeper analysis phase, perhaps coreference resolution, could be done, potentially increasing the precision of the whole detection process.</p>
        </sec>
    </body>
    <back>
        <app-group>
            <app id="app1">
                <title>Multimedia Appendix 1</title>
                <p>Publishable information keywords.</p>
                <media xlink:href="jmir_v14i4e95_app1.txt" xlink:title="TXT File, 1KB" />
            </app>
            <app id="app2">
                <title>Multimedia Appendix 2</title>
                <p>Geographic locations.</p>
                <media xlink:href="jmir_v14i4e95_app2.txt" xlink:title="TXT File, 6KB" />
            </app>
            <app id="app3">
                <title>Multimedia Appendix 3</title>
                <p>Health information keywords.</p>
                <media xlink:href="jmir_v14i4e95_app3.txt" xlink:title="TXT File, 951KB" />
            </app>
        </app-group>
        <glossary>
            <title>Abbreviations</title>
            <def-list>
                <def-item>
                    <term id="abb1">P2P</term>
                    <def>
                        <p> peer-to-peer</p>
                    </def>
                </def-item>
                <def-item>
                    <term id="abb2">PHI</term>
                    <def>
                        <p> personal health information</p>
                    </def>
                </def-item>
                <def-item>
                    <term id="abb3">PII</term>
                    <def>
                        <p> personally identifiable information</p>
                    </def>
                </def-item>
            </def-list>
        </glossary>
        <ack>
            <p>We thank Ontario Centres of Excellence and the Natural Sciences and Engineering Research Council of Canada for their financial support. We also thank Terry Copeck for his work on the text extractor filter. We thank Sadrul Chowdhury for his work on the system. We thank the manuscript reviewers and editors for thorough and helpful comments.</p>
        </ack>
        <fn-group>
            <fn fn-type="conflict">
                <p>None declared.</p>
            </fn>
        </fn-group>
        <ref-list>
            <ref id="ref1">
                <label>1</label>
                <nlm-citation citation-type="book">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Hersh</surname>
                            <given-names>W</given-names>
                        </name>
                    </person-group>
                    <source>Information Retrieval: A Health and Biomedical Perspective. 3rd edition. Hannah K, Ball M, editors. Health Informatics</source>
                    <year>2009</year>
                    <publisher-loc>New York, NY</publisher-loc>
                    <publisher-name>Springer</publisher-name>
                </nlm-citation>
            </ref>
            <ref id="ref2">
                <label>2</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Johnson</surname>
                            <given-names>E</given-names>
                        </name>
                    </person-group>
                    <article-title>Data hemorrhages in the health-care sector</article-title>
                    <source>Lect Notes Comput Sci</source>
                    <year>2009</year>
                    <volume>5628</volume>
                    <fpage>71</fpage>
                    <lpage>89</lpage>
                    <pub-id pub-id-type="doi">10.1007/978-3-642-03549-4_5</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref3">
                <label>3</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>El Emam</surname>
                            <given-names>K</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Neri</surname>
                            <given-names>E</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Jonker</surname>
                            <given-names>E</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Sokolova</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Peyton</surname>
                            <given-names>L</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Neisa</surname>
                            <given-names>A</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Scassa</surname>
                            <given-names>T</given-names>
                        </name>
                    </person-group>
                    <article-title>The inadvertent disclosure of personal health information through peer-to-peer file sharing programs</article-title>
                    <source>J Am Med Inform Assoc</source>
                    <year>2010</year>
                    <volume>17</volume>
                    <issue>2</issue>
                    <fpage>148</fpage>
                    <lpage>58</lpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://jamia.bmj.com/cgi/pmidlookup?view=long&#38;pmid=20190057" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1136/jamia.2009.000232</pub-id>
                    <pub-id pub-id-type="medline">20190057</pub-id>
                    <pub-id pub-id-type="pii">17/2/148</pub-id>
                    <pub-id pub-id-type="pmcid">PMC3000774</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref4">
                <label>4</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Uzuner</surname>
                            <given-names>O</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Sibanda</surname>
                            <given-names>TC</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Luo</surname>
                            <given-names>Y</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Szolovits</surname>
                            <given-names>P</given-names>
                        </name>
                    </person-group>
                    <article-title>A de-identifier for medical discharge summaries</article-title>
                    <source>Artif Intell Med</source>
                    <year>2008</year>
                    <month>01</month>
                    <volume>42</volume>
                    <issue>1</issue>
                    <fpage>13</fpage>
                    <lpage>35</lpage>
                    <pub-id pub-id-type="doi">10.1016/j.artmed.2007.10.001</pub-id>
                    <pub-id pub-id-type="medline">18053696</pub-id>
                    <pub-id pub-id-type="pii">S0933-3657(07)00132-7</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2271040</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref5">
                <label>5</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Uzuner</surname>
                            <given-names>O</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Luo</surname>
                            <given-names>Y</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Szolovits</surname>
                            <given-names>P</given-names>
                        </name>
                    </person-group>
                    <article-title>Evaluating the state-of-the-art in automatic de-identification</article-title>
                    <source>J Am Med Inform Assoc</source>
                    <year>2007</year>
                    <volume>14</volume>
                    <issue>5</issue>
                    <fpage>550</fpage>
                    <lpage>63</lpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://jamia.bmj.com/cgi/pmidlookup?view=long&#38;pmid=17600094" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1197/jamia.M2444</pub-id>
                    <pub-id pub-id-type="medline">17600094</pub-id>
                    <pub-id pub-id-type="pii">M2444</pub-id>
                    <pub-id pub-id-type="pmcid">PMC1975792</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref6">
                <label>6</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Aberdeen</surname>
                            <given-names>J</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Bayer</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Yeniterzi</surname>
                            <given-names>R</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Wellner</surname>
                            <given-names>B</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Clark</surname>
                            <given-names>C</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Hanauer</surname>
                            <given-names>D</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Malin</surname>
                            <given-names>B</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Hirschman</surname>
                            <given-names>L</given-names>
                        </name>
                    </person-group>
                    <article-title>The MITRE Identification Scrubber Toolkit: design, training, and assessment</article-title>
                    <source>Int J Med Inform</source>
                    <year>2010</year>
                    <month>12</month>
                    <volume>79</volume>
                    <issue>12</issue>
                    <fpage>849</fpage>
                    <lpage>59</lpage>
                    <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2010.09.007</pub-id>
                    <pub-id pub-id-type="medline">20951082</pub-id>
                    <pub-id pub-id-type="pii">S1386-5056(10)00168-1</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref7">
                <label>7</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Yeniterzi</surname>
                            <given-names>R</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Aberdeen</surname>
                            <given-names>J</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Bayer</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Wellner</surname>
                            <given-names>B</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Hirschman</surname>
                            <given-names>L</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Malin</surname>
                            <given-names>B</given-names>
                        </name>
                    </person-group>
                    <article-title>Effects of personal identifier resynthesis on clinical text de-identification</article-title>
                    <source>J Am Med Inform Assoc</source>
                    <year>2010</year>
                    <volume>17</volume>
                    <issue>2</issue>
                    <fpage>159</fpage>
                    <lpage>68</lpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://jamia.bmj.com/cgi/pmidlookup?view=long&#38;pmid=20190058" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1136/jamia.2009.002212</pub-id>
                    <pub-id pub-id-type="medline">20190058</pub-id>
                    <pub-id pub-id-type="pii">17/2/159</pub-id>
                    <pub-id pub-id-type="pmcid">PMC3000784</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref8">
                <label>8</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Neamatullah</surname>
                            <given-names>I</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Douglass</surname>
                            <given-names>MM</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Lehman</surname>
                            <given-names>LW</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Reisner</surname>
                            <given-names>A</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Villarroel</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Long</surname>
                            <given-names>WJ</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Szolovits</surname>
                            <given-names>P</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Moody</surname>
                            <given-names>GB</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Mark</surname>
                            <given-names>RG</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Clifford</surname>
                            <given-names>GD</given-names>
                        </name>
                    </person-group>
                    <article-title>Automated de-identification of free-text medical records</article-title>
                    <source>BMC Med Inform Decis Mak</source>
                    <year>2008</year>
                    <volume>8</volume>
                    <fpage>32</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.biomedcentral.com/1472-6947/8/32" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1186/1472-6947-8-32</pub-id>
                    <pub-id pub-id-type="medline">18652655</pub-id>
                    <pub-id pub-id-type="pii">1472-6947-8-32</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2526997</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref9">
                <label>9</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Wellner</surname>
                            <given-names>B</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Huyck</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Mardis</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Aberdeen</surname>
                            <given-names>J</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Morgan</surname>
                            <given-names>A</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Peshkin</surname>
                            <given-names>L</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Yeh</surname>
                            <given-names>A</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Hitzeman</surname>
                            <given-names>J</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Hirschman</surname>
                            <given-names>L</given-names>
                        </name>
                    </person-group>
                    <article-title>Rapidly retargetable approaches to de-identification in medical records</article-title>
                    <source>J Am Med Inform Assoc</source>
                    <year>2007</year>
                    <volume>14</volume>
                    <issue>5</issue>
                    <fpage>564</fpage>
                    <lpage>73</lpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://jamia.bmj.com/cgi/pmidlookup?view=long&#38;pmid=17600096" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1197/jamia.M2435</pub-id>
                    <pub-id pub-id-type="medline">17600096</pub-id>
                    <pub-id pub-id-type="pii">M2435</pub-id>
                    <pub-id pub-id-type="pmcid">PMC1975794</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref10">
                <label>10</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Meystre</surname>
                            <given-names>SM</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Friedlin</surname>
                            <given-names>FJ</given-names>
                        </name>
                        <name name-style="western">
                            <surname>South</surname>
                            <given-names>BR</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Shen</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Samore</surname>
                            <given-names>MH</given-names>
                        </name>
                    </person-group>
                    <article-title>Automatic de-identification of textual documents in the electronic health record: a review of recent research</article-title>
                    <source>BMC Med Res Methodol</source>
                    <year>2010</year>
                    <volume>10</volume>
                    <fpage>70</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.biomedcentral.com/1471-2288/10/70" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1186/1471-2288-10-70</pub-id>
                    <pub-id pub-id-type="medline">20678228</pub-id>
                    <pub-id pub-id-type="pii">1471-2288-10-70</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2923159</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref11">
                <label>11</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Lagu</surname>
                            <given-names>T</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Kaufman</surname>
                            <given-names>EJ</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Asch</surname>
                            <given-names>DA</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Armstrong</surname>
                            <given-names>K</given-names>
                        </name>
                    </person-group>
                    <article-title>Content of weblogs written by health professionals</article-title>
                    <source>J Gen Intern Med</source>
                    <year>2008</year>
                    <month>10</month>
                    <volume>23</volume>
                    <issue>10</issue>
                    <fpage>1642</fpage>
                    <lpage>6</lpage>
                    <pub-id pub-id-type="doi">10.1007/s11606-008-0726-6</pub-id>
                    <pub-id pub-id-type="medline">18649110</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2533366</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref12">
                <label>12</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Buchanan</surname>
                            <given-names>T</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Joinson</surname>
                            <given-names>A</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Paine</surname>
                            <given-names>C</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Reips</surname>
                            <given-names>UD</given-names>
                        </name>
                    </person-group>
                    <article-title>Looking for medical information on the Internet: self-disclosure, privacy and trust</article-title>
                    <source>Health Inf Internet</source>
                    <year>2007</year>
                    <volume>58</volume>
                    <issue>1</issue>
                    <fpage>8</fpage>
                    <lpage>9</lpage>
                </nlm-citation>
            </ref>
            <ref id="ref13">
                <label>13</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Wright</surname>
                            <given-names>KB</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Bell</surname>
                            <given-names>SB</given-names>
                        </name>
                    </person-group>
                    <article-title>Health-related Support Groups on the Internet: Linking Empirical Findings to Social Support and Computer-mediated Communication Theory</article-title>
                    <source>J Health Psychol</source>
                    <year>2003</year>
                    <month>01</month>
                    <volume>8</volume>
                    <issue>1</issue>
                    <fpage>39</fpage>
                    <lpage>54</lpage>
                    <pub-id pub-id-type="doi">10.1177/1359105303008001429</pub-id>
                    <pub-id pub-id-type="medline">22113899</pub-id>
                    <pub-id pub-id-type="pii">8/1/39</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref14">
                <label>14</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Joinson</surname>
                            <given-names>A</given-names>
                        </name>
                    </person-group>
                    <article-title>Self-disclosure in computer-mediated communication: the role of self-awareness and visual anonymity</article-title>
                    <source>Eur J Soc Psychol</source>
                    <year>2001</year>
                    <volume>31</volume>
                    <fpage>177</fpage>
                    <lpage>92</lpage>
                </nlm-citation>
            </ref>
            <ref id="ref15">
                <label>15</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Doing-Harris</surname>
                            <given-names>KM</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Zeng-Treitler</surname>
                            <given-names>Q</given-names>
                        </name>
                    </person-group>
                    <article-title>Computer-assisted update of a consumer health vocabulary through mining of social network data</article-title>
                    <source>J Med Internet Res</source>
                    <year>2011</year>
                    <volume>13</volume>
                    <issue>2</issue>
                    <fpage>e37</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2011/2/e37/" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/jmir.1636</pub-id>
                    <pub-id pub-id-type="medline">21586386</pub-id>
                    <pub-id pub-id-type="pii">v13i2e37</pub-id>
                    <pub-id pub-id-type="pmcid">PMC3221384</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref16">
                <label>16</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Himmel</surname>
                            <given-names>W</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Reincke</surname>
                            <given-names>U</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Michelmann</surname>
                            <given-names>HW</given-names>
                        </name>
                    </person-group>
                    <article-title>Text mining and natural language processing approaches for automatic categorization of lay requests to web-based expert forums</article-title>
                    <source>J Med Internet Res</source>
                    <year>2009</year>
                    <volume>11</volume>
                    <issue>3</issue>
                    <fpage>e25</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2009/3/e25/" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/jmir.1123</pub-id>
                    <pub-id pub-id-type="medline">19632978</pub-id>
                    <pub-id pub-id-type="pii">v11i3e25</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2762848</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref17">
                <label>17</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Konovalov</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Scotch</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Post</surname>
                            <given-names>L</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Brandt</surname>
                            <given-names>C</given-names>
                        </name>
                    </person-group>
                    <article-title>Biomedical informatics techniques for processing and analyzing web blogs of military service members</article-title>
                    <source>J Med Internet Res</source>
                    <year>2010</year>
                    <volume>12</volume>
                    <issue>4</issue>
                    <fpage>e45</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2010/4/e45/" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/jmir.1538</pub-id>
                    <pub-id pub-id-type="medline">20923755</pub-id>
                    <pub-id pub-id-type="pii">v12i4e45</pub-id>
                    <pub-id pub-id-type="pmcid">PMC3234168</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref18">
                <label>18</label>
                <nlm-citation citation-type="confproc">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Lampos</surname>
                            <given-names>V</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Cristianini</surname>
                            <given-names>N</given-names>
                        </name>
                    </person-group>
                    <article-title>Tracking the flu pandemic by monitoring the social web</article-title>
                    <year>2010</year>
                    <conf-name>2nd Workshop on Cognitive Information Processing</conf-name>
                    <conf-date>Jun 14-16, 2010</conf-date>
                    <conf-loc>Elba Island, Tuscany, Italy</conf-loc>
                </nlm-citation>
            </ref>
            <ref id="ref19">
                <label>19</label>
                <nlm-citation citation-type="confproc">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Sokolova</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>El Emam</surname>
                            <given-names>K</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Rose</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Chowdhury</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Neri</surname>
                            <given-names>E</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Jonker</surname>
                            <given-names>E</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Peyton</surname>
                            <given-names>L</given-names>
                        </name>
                    </person-group>
                    <article-title>Personal health information leak prevention in heterogeneous texts</article-title>
                    <year>2009</year>
                    <conf-name>Adaptation of Language Resources and Technology to New Domains</conf-name>
                    <conf-date>Sep 17, 2009</conf-date>
                    <conf-loc>Borovets, Bulgaria</conf-loc>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://dl.acm.org/citation.cfm?id=1859157" />
                    </comment>
                </nlm-citation>
            </ref>
            <ref id="ref20">
                <label>20</label>
                <nlm-citation citation-type="confproc">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Sokolova</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Schramm</surname>
                            <given-names>D</given-names>
                        </name>
                    </person-group>
                    <article-title>Building a patient-based ontology for mining user-written content</article-title>
                    <year>2011</year>
                    <conf-name>Recent Advances in Natural Language Processing</conf-name>
                    <conf-date>Sep 12-14, 2011</conf-date>
                    <conf-loc>Hissar, Bulgaria</conf-loc>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://aclweb.org/anthology-new/R/R11/" />
                    </comment>
                </nlm-citation>
            </ref>
            <ref id="ref21">
                <label>21</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Eysenbach</surname>
                            <given-names>G</given-names>
                        </name>
                    </person-group>
                    <article-title>Medicine 2.0: social networking, collaboration, participation, apomediation, and openness</article-title>
                    <source>J Med Internet Res</source>
                    <year>2008</year>
                    <volume>10</volume>
                    <issue>3</issue>
                    <fpage>e22</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2008/3/e22/" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/jmir.1030</pub-id>
                    <pub-id pub-id-type="medline">18725354</pub-id>
                    <pub-id pub-id-type="pii">v10i3e22</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2626430</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref22">
                <label>22</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Frost</surname>
                            <given-names>JH</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Massagli</surname>
                            <given-names>MP</given-names>
                        </name>
                    </person-group>
                    <article-title>Social uses of personal health information within PatientsLikeMe, an online patient community: what can happen when patients have access to one another's data</article-title>
                    <source>J Med Internet Res</source>
                    <year>2008</year>
                    <volume>10</volume>
                    <issue>3</issue>
                    <fpage>e15</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2008/3/e15/" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/jmir.1053</pub-id>
                    <pub-id pub-id-type="medline">18504244</pub-id>
                    <pub-id pub-id-type="pii">v10i3e15</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2553248</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref23">
                <label>23</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Chou</surname>
                            <given-names>WY</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Hunt</surname>
                            <given-names>YM</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Beckjord</surname>
                            <given-names>EB</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Moser</surname>
                            <given-names>RP</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Hesse</surname>
                            <given-names>BW</given-names>
                        </name>
                    </person-group>
                    <article-title>Social media use in the United States: implications for health communication</article-title>
                    <source>J Med Internet Res</source>
                    <year>2009</year>
                    <month>12</month>
                    <volume>11</volume>
                    <issue>4</issue>
                    <fpage>e48</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2009/4/e48/" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/jmir.1249</pub-id>
                    <pub-id pub-id-type="medline">19945947</pub-id>
                    <pub-id pub-id-type="pii">v11i4e48</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2802563</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref24">
                <label>24</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Tu</surname>
                            <given-names>K</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Klein-Geltink</surname>
                            <given-names>J</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Mitiku</surname>
                            <given-names>TF</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Mihai</surname>
                            <given-names>C</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Martin</surname>
                            <given-names>J</given-names>
                        </name>
                    </person-group>
                    <article-title>De-identification of primary care electronic medical records free-text data in Ontario, Canada</article-title>
                    <source>BMC Med Inform Decis Mak</source>
                    <year>2010</year>
                    <volume>10</volume>
                    <fpage>35</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.biomedcentral.com/1472-6947/10/35" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1186/1472-6947-10-35</pub-id>
                    <pub-id pub-id-type="medline">20565894</pub-id>
                    <pub-id pub-id-type="pii">1472-6947-10-35</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2907300</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref25">
                <label>25</label>
                <nlm-citation citation-type="web">
                    <source>Amazon.com</source>
                    <year>2010</year>
                    <access-date>2010-08-23</access-date>
                    <comment>Amazon Web Services<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://aws.amazon.com/">http://aws.amazon.com/</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5sCQ5qVCZ</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref26">
                <label>26</label>
                <nlm-citation citation-type="web">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>van Os</surname>
                            <given-names>A</given-names>
                        </name>
                    </person-group>
                    <source>Adri van Os</source>
                    <year>2008</year>
                    <month>04</month>
                    <day>15</day>
                    <access-date>2010-08-23</access-date>
                    <comment>Antiword<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.winfield.demon.nl/">http://www.winfield.demon.nl/</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5sCQHkVjX</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref27">
                <label>27</label>
                <nlm-citation citation-type="web">
                    <source>Text-Mining Tool.com</source>
                    <year>2010</year>
                    <access-date>2010-08-23</access-date>
                    <comment>Text Mining Tool<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://text-mining-tool.com/">http://text-mining-tool.com/</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5sCQOQOmM</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref28">
                <label>28</label>
                <nlm-citation citation-type="web">
                    <source>Kryloff Technologies, Inc</source>
                    <year>2010</year>
                    <access-date>2010-08-23</access-date>
                    <comment>GetText<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.kryltech.com/freestf.htm">http://www.kryltech.com/freestf.htm</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5sCQWKNpJ</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref29">
                <label>29</label>
                <nlm-citation citation-type="web">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>van Noord</surname>
                            <given-names>G</given-names>
                        </name>
                    </person-group>
                    <source>Cavnar WB, Trenkle JM</source>
                    <access-date>2010-08-23</access-date>
                    <comment>TextCat<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.let.rug.nl/~vannoord/TextCat/">http://www.let.rug.nl/~vannoord/TextCat/</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5sCQhnQNb</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref30">
                <label>30</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>El Emam</surname>
                            <given-names>K</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Jabbouri</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Sams</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Drouet</surname>
                            <given-names>Y</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Power</surname>
                            <given-names>M</given-names>
                        </name>
                    </person-group>
                    <article-title>Evaluating common de-identification heuristics for personal health information</article-title>
                    <source>J Med Internet Res</source>
                    <year>2006</year>
                    <volume>8</volume>
                    <issue>4</issue>
                    <fpage>e28</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2006/4/e28/" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/jmir.8.4.e28</pub-id>
                    <pub-id pub-id-type="medline">17213047</pub-id>
                    <pub-id pub-id-type="pii">v8i4e28</pub-id>
                    <pub-id pub-id-type="pmcid">PMC1794009</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref31">
                <label>31</label>
                <nlm-citation citation-type="web">
                    <person-group person-group-type="author">
                        <collab>Judges' Technology Advisory Committee</collab>
                    </person-group>
                    <source>Canadian Judicial Council</source>
                    <year>2005</year>
                    <month>03</month>
                    <access-date>2012-06-05</access-date>
                    <comment>Use of Personal Information in Judgments and Recommended Protocol<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://cjc-ccm.gc.ca/cmslib/general/news_pub_techissues_UseProtocol_2005_en.pdf">http://cjc-ccm.gc.ca/cmslib/general/news_pub_techissues_UseProtocol_2005_en.pdf</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">68CIEk1BH</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref32">
                <label>32</label>
                <nlm-citation citation-type="web">
                    <person-group person-group-type="author">
                        <collab>World Health Organization</collab>
                    </person-group>
                    <source>US Centers for Disease Control and Prevention</source>
                    <year>2009</year>
                    <month>09</month>
                    <day>1</day>
                    <access-date>2010-08-23</access-date>
                    <comment>International Classification of Diseases, Ninth Revision (ICD-9)<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.cdc.gov/nchs/icd/icd9.htm">http://www.cdc.gov/nchs/icd/icd9.htm</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5sCQnajgl</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref33">
                <label>33</label>
                <nlm-citation citation-type="web">
                    <person-group person-group-type="author">
                        <collab>MedDRA MSSO</collab>
                    </person-group>
                    <source>Northrop Grumman Corporation</source>
                    <year>2010</year>
                    <month>07</month>
                    <day>29</day>
                    <access-date>2010-08-23</access-date>
                    <comment>Medical Dictionary for Regulatory Activities<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.meddramsso.com/index.asp">http://www.meddramsso.com/index.asp</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5sCR0vi82</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref34">
                <label>34</label>
                <nlm-citation citation-type="web">
                    <source>Health Canada</source>
                    <year>2010</year>
                    <month>04</month>
                    <day>1</day>
                    <access-date>2010-08-23</access-date>
                    <comment>Drug Product Database<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.hc-sc.gc.ca/dhp-mps/prodpharma/databasdon/">http://www.hc-sc.gc.ca/dhp-mps/prodpharma/databasdon/</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5sCRCKIZv</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref35">
                <label>35</label>
                <nlm-citation citation-type="web">
                    <source>Merck Sharp &amp; Dohme Corp</source>
                    <year>2008</year>
                    <month>07</month>
                    <access-date>2010-08-23</access-date>
                    <comment>Trade Names of Some Commonly Used Drugs<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.merck.com/mmpe/appendixes/ap2/ap2a.html">http://www.merck.com/mmpe/appendixes/ap2/ap2a.html</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5sCRGueLP</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref36">
                <label>36</label>
                <nlm-citation citation-type="book">
                    <person-group person-group-type="author">
                        <collab>Anonymous</collab>
                    </person-group>
                    <source>Webster's New World Medical Dictionary</source>
                    <year>2003</year>
                    <publisher-loc>New York, NY</publisher-loc>
                    <publisher-name>Wiley</publisher-name>
                </nlm-citation>
            </ref>
            <ref id="ref37">
                <label>37</label>
                <nlm-citation citation-type="web">
                    <source>Discordia Ltd</source>
                    <year>2010</year>
                    <access-date>2010-08-23</access-date>
                    <comment>Shareaza<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.shareaza.com/">http://www.shareaza.com/</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5sCPqEOcd</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref38">
                <label>38</label>
                <nlm-citation citation-type="book">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Christman</surname>
                            <given-names>M</given-names>
                        </name>
                    </person-group>
                    <article-title>Sampling of rare populations</article-title>
                    <source>Pfeffermann D, Rao CR, editors. Handbook of Statistics 29A: Sample Surveys: Design, Methods and Applications</source>
                    <year>2009</year>
                    <publisher-loc>New York, NY</publisher-loc>
                    <publisher-name>Elsevier</publisher-name>
                </nlm-citation>
            </ref>
            <ref id="ref39">
                <label>39</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Venette</surname>
                            <given-names>RC</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Moon</surname>
                            <given-names>RD</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Hutchison</surname>
                            <given-names>WD</given-names>
                        </name>
                    </person-group>
                    <article-title>Strategies and statistics of sampling for rare individuals</article-title>
                    <source>Annu Rev Entomol</source>
                    <year>2002</year>
                    <volume>47</volume>
                    <fpage>143</fpage>
                    <lpage>174</lpage>
                    <pub-id pub-id-type="doi">10.1146/annurev.ento.47.091201.145147</pub-id>
                    <pub-id pub-id-type="medline">11729072</pub-id>
                    <pub-id pub-id-type="pii">47/1/143</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref40">
                <label>40</label>
                <nlm-citation citation-type="confproc">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Chen</surname>
                            <given-names>K</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Shen</surname>
                            <given-names>H</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Zhang</surname>
                            <given-names>H</given-names>
                        </name>
                    </person-group>
                    <article-title>Leveraging social networks for P2P content-based files sharing in mobile ad hoc networks</article-title>
                    <source>Proceedings</source>
                    <year>2011</year>
                    <conf-name>2011 IEEE 8th International Conference on Mobile Adhoc and Sensor Systems</conf-name>
                    <conf-date>Oct 17-22, 2011</conf-date>
                    <conf-loc>Valencia, Spain</conf-loc>
                    <pub-id pub-id-type="doi">10.1109/MASS.2011.24</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref41">
                <label>41</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Sokolova</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>El Emam</surname>
                            <given-names>K</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Chowdhury</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Neri</surname>
                            <given-names>E</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Rose</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Jonker</surname>
                            <given-names>E</given-names>
                        </name>
                    </person-group>
                    <article-title>Evaluation of rare event detection</article-title>
                    <source>Lect Notes Comput Sci</source>
                    <year>2010</year>
                    <volume>6085</volume>
                    <fpage>379</fpage>
                    <lpage>383</lpage>
                    <pub-id pub-id-type="doi">10.1007/978-3-642-13059-5_51</pub-id>
                </nlm-citation>
            </ref>
        </ref-list>
    </back>
</article>
