<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
    <front>
        <journal-meta>
            <journal-id journal-id-type="publisher-id">JMIR</journal-id>
            <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
            <journal-title>Journal of Medical Internet Research</journal-title>
            <issn pub-type="epub">1438-8871</issn>
            <publisher>
                <publisher-name>Gunther Eysenbach</publisher-name>
                <publisher-loc>JMIR Publications Inc., Toronto, Canada</publisher-loc>
            </publisher>
        </journal-meta>
        <article-meta>
            <article-id pub-id-type="publisher-id">v13i4e98</article-id>
            <article-id pub-id-type="pmid">22112583</article-id>
            <article-id pub-id-type="doi">10.2196/jmir.1799</article-id>
            <article-categories>
                <subj-group subj-group-type="article-type">
                    <subject>Original Paper</subject>
                </subj-group>
            </article-categories>
            <title-group>
                <article-title>Using Natural Language Processing to Enable In-depth Analysis of Clinical Messages Posted to an Internet Mailing List: A Feasibility Study</article-title>
            </title-group>
            <contrib-group>
                <contrib contrib-type="editor">
                    <name>
                        <surname>Eysenbach</surname>
                        <given-names>Gunther</given-names>
                    </name>
                </contrib>
            </contrib-group>
            <contrib-group>
                <contrib contrib-type="reviewer">
                    <name>
                        <surname>Acharya</surname>
                        <given-names>Amit</given-names>
                    </name>
                </contrib>
                <contrib contrib-type="reviewer">
                    <name>
                        <surname>Zeng</surname>
                        <given-names>Qing</given-names>
                    </name>
                </contrib>
                <contrib contrib-type="reviewer">
                    <name>
                        <surname>Keselman</surname>
                        <given-names>Alla</given-names>
                    </name>
                </contrib>
            </contrib-group>
            <contrib-group>
                <contrib contrib-type="author" id="contrib1" corresp="yes">
                    <name name-style="western">
                        <surname>Bekhuis</surname>
                        <given-names>Tanja</given-names>
                    </name>
                    <degrees>PhD, MS, MLIS</degrees>
                    <xref ref-type="aff" rid="aff1">1</xref>
                    <address>
                        <institution>Department of Biomedical Informatics</institution>
                        <institution>School of Medicine</institution>
                        <institution>University of Pittsburgh</institution>
                        <addr-line>UPMC Cancer Pavilion, Suite 301-338</addr-line>
                        <addr-line>5150 Centre Avenue</addr-line>
                        <addr-line>Pittsburgh, PA, 15232</addr-line>
                        <country>United States</country>
                        <phone>1 412 647 6705</phone>
                        <fax>1 412 623 4737</fax>
                        <email>tcb24@pitt.edu</email>
                    </address>
                    <xref ref-type="aff" rid="aff2">2</xref>
                </contrib>
                <contrib contrib-type="author" id="contrib2">
                    <name name-style="western">
                        <surname>Kreinacke</surname>
                        <given-names>Marcos</given-names>
                    </name>
                    <degrees>Dipl-Math</degrees>
                    <xref ref-type="aff" rid="aff3">3</xref>
                </contrib>
                <contrib contrib-type="author" id="contrib3">
                    <name name-style="western">
                        <surname>Spallek</surname>
                        <given-names>Heiko</given-names>
                    </name>
                    <degrees>PhD, DMD, MSBA(CIS)</degrees>
                    <xref ref-type="aff" rid="aff2">2</xref>
                    <xref ref-type="aff" rid="aff4">4</xref>
                </contrib>
                <contrib contrib-type="author" id="contrib4">
                    <name name-style="western">
                        <surname>Song</surname>
                        <given-names>Mei</given-names>
                    </name>
                    <degrees>PhD</degrees>
                    <xref ref-type="aff" rid="aff2">2</xref>
                </contrib>
                <contrib contrib-type="author" id="contrib5">
                    <name name-style="western">
                        <surname>O'Donnell</surname>
                        <given-names>Jean A</given-names>
                    </name>
                    <degrees>DMD, MSN</degrees>
                    <xref ref-type="aff" rid="aff5">5</xref>
                </contrib>
            </contrib-group>
            <aff id="aff1" rid="aff1">
                <sup>1</sup>
                <institution>Department of Biomedical Informatics</institution>
                <institution>School of Medicine</institution>
                <institution>University of Pittsburgh</institution>
                <addr-line>Pittsburgh, PA</addr-line>
                <country>United States</country>
            </aff>
            <aff id="aff2" rid="aff2">
                <sup>2</sup>
                <institution>Center for Dental Informatics</institution>
                <institution>Department of Dental Public Health, School of Dental Medicine</institution>
                <institution>University of Pittsburgh</institution>
                <addr-line>Pittsburgh, PA</addr-line>
                <country>United States</country>
            </aff>
            <aff id="aff3" rid="aff3">
                <sup>3</sup>
                <institution>Institute for Business Taxation</institution>
                <institution>Leibniz University of Hanover</institution>
                <addr-line>Hanover</addr-line>
                <country>Germany</country>
            </aff>
            <aff id="aff4" rid="aff4">
                <sup>4</sup>
                <institution>Office of Faculty Development and Information Management</institution>
                <institution>School of Dental Medicine</institution>
                <institution>University of Pittsburgh</institution>
                <addr-line>Pittsburgh, PA</addr-line>
                <country>United States</country>
            </aff>
            <aff id="aff5" rid="aff5">
                <sup>5</sup>
                <institution>Office of Education and Curriculum</institution>
                <institution>School of Dental Medicine</institution>
                <institution>University of Pittsburgh</institution>
                <addr-line>Pittsburgh, PA</addr-line>
                <country>United States</country>
            </aff>
            <pub-date pub-type="collection">
                <season>Oct-Dec</season>
                <year>2011</year>
            </pub-date>
            <pub-date pub-type="epub">
                <day>23</day>
                <month>11</month>
                <year>2011</year>
            </pub-date>
            <volume>13</volume>
            <issue>4</issue>
            <elocation-id>e98</elocation-id>
            <!--history from ojs - api-xml-->
            <history>
                <date date-type="received">
                    <day>10</day>
                    <month>03</month>
                    <year>2011</year>
                </date>
                <date date-type="rev-request">
                    <day>19</day>
                    <month>04</month>
                    <year>2011</year>
                </date>
                <date date-type="rev-recd">
                    <day>24</day>
                    <month>05</month>
                    <year>2011</year>
                </date>
                <date date-type="accepted">
                    <day>08</day>
                    <month>07</month>
                    <year>2011</year>
                </date>
            </history>
            <!--(c) the authors - correct author names and publication date here if necessary. Date in form ', dd.mm.yyyy' after jmir.org-->
            <copyright-statement>&#169;Tanja Bekhuis, Marcos Kreinacke, Heiko Spallek, Mei Song, Jean A O'Donnell. Originally published in the Journal of Medical Internet Research (http://www.jmir.org), 23.11.2011. </copyright-statement>
            <copyright-year>2011</copyright-year>
            <license license-type="open-access" xlink:href="http://creativecommons.org/licenses/by/2.0/">
                <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (http://creativecommons.org/licenses/by/2.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on http://www.jmir.org/, as well as this copyright and license information must be included.</p>
            </license>
            <self-uri xlink:href="http://www.jmir.org/2011/4/e98/" xlink:type="simple" />
            <abstract>
                <sec sec-type="background">
                    <title>Background</title>
                    <p>An Internet mailing list may be characterized as a virtual community of practice that serves as an information hub with easy access to expert advice and opportunities for social networking. We are interested in mining messages posted to a list for dental practitioners to identify clinical topics. Once we understand the topical domain, we can study dentists&#8217; real information needs and the nature of their shared expertise, and can avoid delivering useless content at the point of care in future informatics applications. However, a necessary first step involves developing procedures to identify messages that are worth studying given our resources for planned, labor-intensive research.</p>
                </sec>
                <sec sec-type="objectives">
                    <title>Objectives</title>
                    <p>The primary objective of this study was to develop a workflow for finding a manageable number of clinically relevant messages from a much larger corpus of messages posted to an Internet mailing list, and to demonstrate the potential usefulness of our procedures for investigators by retrieving a set of messages tailored to the research question of a qualitative research team.</p>
                </sec>
                <sec sec-type="methods">
                    <title>Methods</title>
                    <p>We mined 14,576 messages posted to an Internet mailing list from April 2008 to May 2009. The list has about 450 subscribers, mostly dentists from North America interested in clinical practice. After extensive preprocessing, we used the Natural Language Toolkit to identify clinical phrases and keywords in the messages. Two academic dentists classified collocated phrases in an iterative, consensus-based process to describe the topics discussed by dental practitioners who subscribe to the list. We then consulted with qualitative researchers regarding their research question to develop a plan for targeted retrieval. We used selected phrases and keywords as search strings to identify clinically relevant messages and delivered the messages in a reusable database.</p>
                </sec>
                <sec sec-type="results">
                    <title>Results</title>
                    <p>About half of the subscribers (245/450, 54.4%) posted messages. Natural language processing (NLP) yielded 279,193 clinically relevant tokens or processed words (19% of all tokens). Of these, 2.02% (5634 unique tokens) represent the vocabulary for dental practitioners. Based on pointwise mutual information score and clinical relevance, 325 collocated phrases (eg, <italic>fistula filled obturation</italic> and <italic>herpes zoster</italic>) with 108 keywords (eg, <italic>mercury</italic>) were classified into 13 broad categories with subcategories. In the demonstration, we identified 305 relevant messages (2.1% of all messages) over 10 selected categories with instances of collocated phrases, and 299 messages (2.1%) with instances of phrases or keywords for the category <italic>systemic disease</italic>.</p>
                </sec>
                <sec sec-type="conclusions">
                    <title>Conclusions</title>
                    <p>A workflow with a sequence of machine-based steps and human classification of NLP-discovered phrases can support researchers who need to identify relevant messages in a much larger corpus. Discovered phrases and keywords are useful search strings to aid targeted retrieval. We demonstrate the potential value of our procedures for qualitative researchers by retrieving a manageable set of messages concerning systemic and oral disease.</p>
                </sec>
            </abstract>
            <kwd-group>
                <kwd>Dentistry</kwd>
                <kwd>dental informatics</kwd>
                <kwd>clinical research informatics</kwd>
                <kwd>natural language processing</kwd>
                <kwd>information storage and retrieval</kwd>
                <kwd>electronic mail</kwd>
                <kwd>information-seeking behavior</kwd>
            </kwd-group>
        </article-meta>
    </front>
    <body>
        <sec sec-type="introduction">
            <title>Introduction</title>
            <p>In the United States, about 70% of dentists work in relative isolation as solo practitioners or in small groups [<xref ref-type="bibr" rid="ref1">1</xref>]. Unfortunately, independent practitioners cannot afford to subscribe to all of the information resources readily available to dental faculty, academic researchers, and clinicians in large organizations. For example, the University of Pittsburgh&#8217;s Health Sciences Library System [<xref ref-type="bibr" rid="ref2">2</xref>] also serves UPMC, a global health enterprise. Dentists affiliated with either of these organizations have access to more than 3800 books on general dentistry, endodontics, pediatrics, periodontics, restoration, and special care; 15 full-text electronic books on dentistry, including important core resources; and more than 75 dentistry journals, most of which are available electronically.</p>
            <p>In contrast, independent practitioners typically meet their information needs by relying on colleagues, discussion lists, news outlets, and a few professional journals to which they subscribe [<xref ref-type="bibr" rid="ref3">3</xref>]. Even though most dentists in the United States have access to the American Dental Association&#8217;s library by virtue of their membership, retrieval of more than the occasional full text is expensive. For example, if a member finds information in PubMed [<xref ref-type="bibr" rid="ref4">4</xref>] not freely available in PubMed Central or an open access journal, the fee for retrieval and delivery by the library is US $7 to US $15 per article, and US $15 for one or two books, with possible late charges [<xref ref-type="bibr" rid="ref5">5</xref>]. Fees are higher for nonmembers.</p>
            <p>Thus, we conclude that the full panoply of important resources is inaccessible to most dentists when questions arise regarding best practice, especially at the point of care when readily available information is needed. This fact combined with dentists&#8217; preference for first consulting peers means that online communities are potentially valuable sources of information [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. Such communities could be used in the future as conduits for delivery of evidence-based information, such as updated guidelines for clinical care. As for delivery of information at the point of care, this urgent need demands informatics solutions and is the focus of a US federally funded project led by Dr. Heiko Spallek [<xref ref-type="bibr" rid="ref9">9</xref>].</p>
            <sec>
                <title>Communities of Practice</title>
                <p>An online or e-community is sometimes characterized as a virtual community of practice (CoP) [<xref ref-type="bibr" rid="ref10">10</xref>] because members are geographically isolated yet connected socially via the Internet. A virtual CoP can serve as an information hub with easy access to expert advice and opportunities for social networking (eg, see [<xref ref-type="bibr" rid="ref11">11</xref>]). The rationale for considering the opinions of peers expressed online is similar in spirit to the way in which research is initiated by practitioner-investigators in practice-based research networks [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. In both cases, the value of clinical experience is recognized.</p>
                <p>For our purposes, we are interested in knowing which clinical topics are discussed by dentists in a CoP. To do this, we mine their asynchronous messages posted to an enduring and active online discussion list. Once we understand the topics covered in the corpus of messages, we can study dentists&#8217; real information needs and the nature of their shared expertise, and can avoid delivering useless content to the community or at the point of care in future informatics applications.</p>
            </sec>
            <sec>
                <title>Assisting Qualitative Researchers</title>
                <p>To plan a labor-intensive study of information needs with its in-depth content analyses of clinical topics and emergent themes, one must carefully consider available human resources. For example, we have two academic dental researchers who can devote just a few days to coding and interpreting thematic content of messages with guidance from an experienced qualitative researcher. The problem then is how to assist qualitative researchers by finding a manageable number of clinically relevant messages that are worth studying given available resources.</p>
                <p>If we know the typical length of messages, the time it takes to code a message regarding clinical topics and themes, and the number of hours researchers can devote to the content analyses, we can estimate the sample size (n) that will ensure the feasibility of the planned content analyses. Here, the corpus consisted of thousands of messages posted to an Internet mailing list for practicing dental professionals, primarily general dentists. We assumed that two academic dentists and one qualitative researcher could manage a few hundred messages.</p>
                <p>In general, we considered three options for drawing the sample: (1) randomly sample n messages from the corpus, (2) restrict the interval of time in which n messages occur and select all messages within that interval, and (3) use natural language processing (NLP) to identify clinical topics and, depending on the research question, retrieve n messages with useful content.</p>
                <p>The advantage of the first two options is that they are well known and easy to implement. A major disadvantage is that the selected messages may be irrelevant to the researchers&#8217; interests, especially given the informal quality of messages posted online. In the present study, the purpose of the mailing list from which the corpus originated is to offer dentists a place to discuss their clinical concerns. However, many of the messages were off topic. For example, dentists chatted about the big football game, the trip to Europe, the swimsuit issue of <italic>Sports Illustrated</italic>, Michael Jackson&#8217;s death, and aging parents. Although the third option is novel and more time consuming than the first two, it is in keeping with the notion that the nature of the corpus needs to be understood <italic>before</italic> messages are selected. This is because inferences depend on the selected units of analysis such as blocks of text [<xref ref-type="bibr" rid="ref14">14</xref>]. Thus, the third option ensures the feasibility and probably the quality of content analyses by identifying a manageable number of messages relevant to the research question.</p>
                <p>In this paper, we present a workflow for identifying and retrieving a manageable subset of relevant messages from a much larger corpus. It involves a sequence of machine-based steps along with human classification of clinical phrases discovered with NLP. We also demonstrate the value of this approach for enabling study of text messages by qualitative researchers. As an example, we describe the strategy we used to retrieve messages for a study underway that involves in-depth content analyses.</p>
                <p>A preliminary version of this paper was presented at the 2010 Annual Symposium of the American Medical Informatics Association [<xref ref-type="bibr" rid="ref15">15</xref>].</p>
            </sec>
        </sec>
        <sec sec-type="methods">
            <title>Methods</title>
            <p>We mined the clinical content of 14,576 electronic messages posted to a fee-based discussion list during an approximate 1-year study period from April 18, 2008 to May 28, 2009. The subscribers to this global list are dental practitioners, mostly dentists from North America interested in clinical practice.</p>
            <sec>
                <title>Deidentification</title>
                <p>Because the origin of our corpus of messages is a private Internet mailing list, we took care to preserve confidentiality even though (1) the University of Pittsburgh Institutional Review Board approved this study as being exempt (PRO08040313), (2) the owner of the list deleted identifying information from the message headers before sharing content, (3) messages are regularly delivered to about 450 subscribers and then saved in a searchable archive, and (4) anyone interested in clinical dental care can subscribe. The number of subscribers and the ease with which one can subscribe suggest that this mailing list has a public aspect. Nevertheless, we went through several rounds of deidentification for two reasons: (1) to ensure confidentiality [<xref ref-type="bibr" rid="ref16">16</xref>] for future data sharing, and (2) to optimize NLP by stripping out irrelevant information. We also used Google to confirm that excerpts presented in this paper are not easily retrievable.</p>
                <p>During NLP (see below), we deleted stopwords (eg, articles and prepositions) to optimize discovery of topical content. Surprisingly, deletion of stopwords may help preserve anonymity. This idea is based on knowing that forensic researchers use stylistic properties of messages, including number and distribution of function or stopwords, to identify authors of email [<xref ref-type="bibr" rid="ref17">17</xref>]. We also deleted any remaining names and places by using lists and a gazetteer, respectively, available in the Natural Language Toolkit (NLTK) [<xref ref-type="bibr" rid="ref18">18</xref>].</p>
            </sec>
            <sec>
                <title>Preprocessing</title>
                <p>Mining email is challenging because of the nature of the messages [<xref ref-type="bibr" rid="ref19">19</xref>]. For example, email can be ill formed linguistically with spelling and grammatical errors, and style can be idiosyncratic [<xref ref-type="bibr" rid="ref17">17</xref>]. Typically, email is particularly noisy in that much of the data are irrelevant to the research question. For these reasons, processing messages is essential before clinical topics can be discovered.</p>
                <p>Initially, we extracted the body of each message and deleted threaded responses, which is appropriate given our interest in discovering clinical topics rather than analyzing discourse. To clean the data further, we analyzed message patterns to identify recurring sources of noise (ie, data that obscure message content and meaningful frequencies in the original texts). Consequently, we deleted forwarded and quoted messages; embedded visual data such as x-ray images and photographs; virus- or spam-free notices; Microsoft Outlook notices; advertisements and footers; and signature lines. The latter often include self-promotional text.</p>
            </sec>
            <sec>
                <title>Natural Language Processing</title>
                <p>We used the open source NLTK version 2.0 with Python version 2.6 (Python Software Foundation, Wolfeboro Falls, NH, USA) to analyze preprocessed text. For readers new to NLP, the textbook <italic>Natural Language Processing with Python</italic> is a useful resource [<xref ref-type="bibr" rid="ref18">18</xref>]. At the NLTK website [<xref ref-type="bibr" rid="ref20">20</xref>], one can access the textbook, as well as download the programming language Python, optional packages, and the NLTK modules for NLP and text analytics.</p>
                <p>Note that in this section we italicize terms that may be unfamiliar to readers.</p>
                <p>We sorted and concatenated the messages by date to enable tracking discussion of topics over time. We also converted to lower case and selected <italic>alphabetic tokens</italic> (processed words or strings of letter characters) with length &#62;3 characters. We deleted English <italic>stopwords</italic> (short function words such as &#8220;a&#8221; and &#8220;the&#8221;), as well as names and places. We explored the usefulness of the obtained <italic>vocabulary</italic> (set of unique tokens), as well as <italic>bigrams</italic> and <italic>trigrams</italic> (pairs and triples of contiguous processed words) by examining the 100 and 300 most frequent tokens and <italic>n-grams</italic> (bigrams and trigrams). However, these were deemed clinically uninteresting.</p>
                <p>To find clinical <italic>content-bearing tokens</italic> (substantive words such as apolipoprotein and stenosis) and phrases, we selected tokens with length &#62;5 and frequency &#62;7, and then derived n-grams. The rationale for this filter is similar to one presented in the NLTK text [<xref ref-type="bibr" rid="ref18">18</xref>] where the goal is to find words and phrases that characterize a <italic>genre</italic>. Here the genre is <italic>email with a clinical focus written by dental practitioners</italic>. We also created <italic>collocated n-grams</italic>. <italic>Collocations</italic> are contiguous tokens that occur together more often than one would expect if the tokens were probabilistically independent. We selected the top 600 collocated bigrams and trigrams (300 for each type) by computing the pointwise mutual information measure for each n-gram and then sorting.</p>
                <p>We informally confirmed that collocations derived from the content-bearing tokens were likely to retrieve useful messages by constructing <italic>concordances</italic> for selected tokens. A <italic>concordance</italic> is a set of retrieved lines with windows of text around a token or target word. The windows allow one to explore the contexts in which a target word occurs in the corpus. To build a concordance using the NLTK [<xref ref-type="bibr" rid="ref20">20</xref>], one specifies the window size or number of characters per line, as well as the number of lines to display. For example, we examined the concordance for <italic>lesion</italic> to preview message content. Here are two samples from its concordance:</p>
                <disp-quote>
                    <p>...[t]his is almost always seen in younger patients. I&#8217;m betting this lesion is of endodontic origin. Tough case to diagnose with certainty...</p>
                </disp-quote>
                <disp-quote>
                    <p>...they&#8217;re looking for cancer. They will NOT understand that if a lesion looks like cancer the Brush Test is not indicated. If you see a...</p>
                </disp-quote>
            </sec>
            <sec>
                <title>Classification of Phrases and Selection of Keywords</title>
                <p>Although most of the collocations seem to characterize dentists&#8217; clinical language, some are irrelevant. For example, here is a sample of collocations with irrelevant trigrams in italics: molecular bacterial antigens, <italic>committing stating profitable</italic>, <italic>perspective agreement lobbyists</italic>, methotrexate causative factor, inhibits demineralization enamel, <italic>driving cadillac attack</italic>, mutans streptococci presence.</p>
                <p>Thus, two academic dentists (HS, JO) selected a subset of relevant collocated phrases, including bigrams and trigrams that could be used as search strings to retrieve messages with clinical content. Note that some n-grams overlap. By retaining overlapping n-grams, if they exist, we ensure a broader search than if we use just trigrams. (Most overlapping n-grams point to the same messages, but not always.) An example of an overlapping pair of n-grams is <italic>prescribed amoxicillin hydrocodone</italic> and <italic>amoxicillin hydrocodone</italic>.</p>
                <p>The dentists also classified the phrases they selected by sorting them into broad categories with subcategories; this is considered an inductive approach to classification. Then they labeled the categories and subcategories. The process for both selection and classification was an iterative one involving discussion to reach consensus. The emergent classification scheme describes the clinical topics of concern to the dental practitioners who posted to the online mailing list. It likely will be useful to the qualitative researchers when they code messages for later content analyses [<xref ref-type="bibr" rid="ref21">21</xref>].</p>
                <p>After the phrases were classified, we identified embedded keywords (unigrams) to ensure that retrieval could be even broader, if desired. We defined a keyword as one that occurs at least twice in the full set of collocations. Each variant or closely related word counts as an occurrence. For example, <italic>plaque</italic> and <italic>plaques</italic>, as well as <italic>atherosclerosis</italic> and <italic>atherosclerotic</italic>, are variants; <italic>cardiac</italic> and <italic>myocardial</italic> are closely related. All six italicized examples can be used as search strings to find messages.</p>
            </sec>
            <sec>
                <title>Finding Relevant Messages: A Demonstration</title>
                <p>To demonstrate how the workflow presented in this paper can help researchers (see <xref ref-type="fig" rid="figure1">Figure 1</xref>), consider the following scenario. In our research center, a qualitative study investigating the information needs of dentists regarding the relationship between systemic disease and oral health is underway. Given this focus, two researchers independently selected some of the NLP-discovered phrases that we had identified and classified in this study. They reached consensus by discussion to determine the final list of phrases. Thus, they found a subset of phrases with embedded keywords in a subset of categories. We used the selected phrases and keywords as search strings to find messages relevant to their research question.</p>
                <p>Because the content-bearing phrases were discovered in a merged file that had been considerably processed, a question arose as to what should be the maximum number of allowable characters between words in a phrase when searching cleaned messages not yet processed with NLP. In an informal assessment, we used 20 phrases across categories as search strings and found that the number of characters between any two words in a phrase ranged from 1 to 78. As a conservative estimate, we therefore chose to limit the interval to at most 100 characters. The aptness of this choice was borne out by the results (see below). Briefly, we carried out the following steps to retrieve and organize messages:</p>
                <list list-type="order">
                    <list-item>
                        <p>Create search strings based on collocations by first splitting phrases into words. Then for each phrase, recombine the words in any order with at most 100 characters between words. (We ignored order because words in discovered phrases were sometimes reordered in the messages, eg, <italic>mutans streptococci</italic> versus <italic>streptococci mutans</italic>.)</p>
                    </list-item>
                    <list-item>
                        <p>Use each keyword as a search string. If a keyword appears adjacent to another keyword in a phrase, preserve the order and search for the concatenated string.</p>
                    </list-item>
                    <list-item>
                        <p>Match the search strings to cleaned message texts; retrieve messages with at least one matching string.</p>
                    </list-item>
                    <list-item>
                        <p>Sort messages into folders (directories) per category, as well as into folders by type of match (phrase or keyword). (For example, messages with at least one phrase from the category <italic>systemic disease</italic> were sorted into a folder for that category, as well as a folder for all messages with instances of clinically relevant phrases. Similarly, messages with at least one keyword match were sorted into corresponding folders.)</p>
                    </list-item>
                    <list-item>
                        <p>Deliver deduplicated messages in folders to the researchers. (This sorting helps them find the messages they want to analyze. Further, filenames include the date when the message was posted plus a unique database identifier, which allows tracking of change in topical discussion over time, as well as retrieval of particular messages.)</p>
                    </list-item>
                </list>
                <p>For illustration purposes, consider the excerpted messages below that can be retrieved by using the following as search strings: (1) <italic>fistula filled obturation</italic> [trigram], (2) <italic>herpes zoster</italic> [bigram], and (3) <italic>mercury</italic> [keyword]<italic>.</italic> Remember that a maximum of 100 characters is allowed between the italicized words:</p>
                <list list-type="order">
                    <list-item>
                        <p>
                            <italic>...If you have a tooth with an actively draining</italic> fistula <italic>(pus</italic> filled <italic>canal), do you do one visit endo if you can get a dry canal before</italic> obturation<italic>? Or do you medicate for some time period and fill at a later date?...</italic>
                        </p>
                    </list-item>
                    <list-item>
                        <p>
                            <italic>...patient [with] recurrent ulcers on his palate [that] follow the distribution of the greater palatine nerve... I suspect</italic> herpes zoster.<italic> Most of the time I&#8217;ve seen this it&#8217;s been unilateral, but in his case it&#8217;s always bilateral. What other Dxs [diagnoses] should I be considering...</italic>
                        </p>
                    </list-item>
                    <list-item>
                        <p>
                            <italic>&#8230;Am I missing the point or is the issue (the real issue) with</italic> mercury <italic>not whether it causes systemic disease but rather the environmental issue of</italic> mercury <italic>in the food chain? We all (in the UK) have to have amalgam separators now but we know they&#8217;re not foolproof...</italic>
                        </p>
                    </list-item>
                </list>
                <fig id="figure1" position="float">
                    <label>Figure 1</label>
                    <caption>
                        <p>Workflow for finding clinically relevant messages posted to an Internet mailing list.</p>
                    </caption>
                    <graphic xlink:href="jmir_v13i4e98_fig1.jpg" alt-version="no" mimetype="image" position="float" xlink:type="simple" />
                </fig>
            </sec>
        </sec>
        <sec sec-type="results">
            <title>Results</title>
            <sec>
                <title>Subscriber Participation</title>
                <p>Just over half (245, or 54.4%) of the approximate total number of subscribers (N = 450) posted 14,576 messages. Of these, 21 subscribers (5% of the list) posted 7288 (50%) of the messages; 29 subscribers (6% of the list) posted 3644 (25%) of the messages; and 195 subscribers (43.3% of the list) posted the remaining 3644 (25%) of the messages (see <xref ref-type="fig" rid="figure2">Figure 2</xref>). Thus, 205 subscribers (45.6%) were passive (ie, they received messages but did not otherwise contribute to the message traffic during the study interval). Note that the total number of subscribers is approximate because the list size varies somewhat over time.</p>
                <fig id="figure2" position="float">
                    <label>Figure 2</label>
                    <caption>
                        <p>Cumulative distribution of messages posted by dental practitioners to an online discussion list.</p>
                    </caption>
                    <graphic xlink:href="jmir_v13i4e98_fig2.jpg" alt-version="no" mimetype="image" position="float" xlink:type="simple" />
                </fig>
            </sec>
            <sec>
                <title>Natural Language Processing</title>
                <p>The concatenated file of cleaned messages yielded 1,468,244 tokens. Initial NLP (selecting alphabetic tokens with length &#62;3, deleting names and places, etc) reduced the number of tokens to 533,251 (36.32%).</p>
                <p>Filtering to find clinical content-bearing tokens yielded 279,193 tokens (19.02%). For our purposes, the unique tokens in the content-bearing set (5634, or 2.02% of the content-bearing tokens) represent the dental practitioners&#8217; vocabulary. We obtained 208,026 bigrams and 252,931 trigrams, and derived collocations. For illustration purposes, we present a handful of collocated bigrams and trigrams: <italic>osteoclastic activity</italic>, <italic>painful sequestrum</italic>, and <italic>intravenous bisphosphonates</italic> (bigrams); <italic>glucose homeostasis inflammation</italic>, <italic>irreversible pulpitis apical</italic>, and <italic>supragingival scaling prophylaxis</italic> (trigrams).</p>
            </sec>
            <sec>
                <title>Classification of Phrases and Selection of Keywords</title>
                <p>The classification of phrases resulted in 13 broad categories with subcategories. <xref ref-type="table" rid="table1">Table 1</xref> presents the categories and distribution of collocated phrases and embedded keywords. The entire classification including categories and subcategories, 325 collocated phrases, and 108 embedded keywords is presented in <xref ref-type="app" rid="app1">Multimedia Appendix 1</xref>.</p>
                <table-wrap id="table1" position="float">
                    <label>Table 1</label>
                    <caption>
                        <p>Distribution of collocated phrases and keywords by category</p>
                    </caption>
                    <table cellpadding="8" cellspacing="0" border="1" rules="groups" frame="hsides" width="1000">
                        <col width="297" />
                        <col width="396" />
                        <col width="307" />
                        <thead>
                            <tr valign="bottom">
                                <td>Category</td>
                                <td>n of collocated phrases<sup>a</sup> (% of phrases)<sup>b</sup>
                                </td>
                                <td>n of keywords<sup>c</sup> (% of keywords)<sup>d</sup>
                                </td>
                            </tr>
                        </thead>
                        <tbody>
                            <tr valign="top">
                                <td>Systemic disease</td>
                                <td>49 (15)</td>
                                <td>21 (15)</td>
                            </tr>
                            <tr valign="top">
                                <td>Endodontics</td>
                                <td>18 (6)</td>
                                <td>9 (6)</td>
                            </tr>
                            <tr valign="top">
                                <td>Orthodontics</td>
                                <td>8 (3)</td>
                                <td>3 (2)</td>
                            </tr>
                            <tr valign="top">
                                <td>Periodontics</td>
                                <td>12 (4)</td>
                                <td>6 (4)</td>
                            </tr>
                            <tr valign="top">
                                <td>Restorative dentistry</td>
                                <td>66 (20)</td>
                                <td>20 (14)</td>
                            </tr>
                            <tr valign="top">
                                <td>Oral and maxillofacial surgery</td>
                                <td>26 (8)</td>
                                <td>18 (13)</td>
                            </tr>
                            <tr valign="top">
                                <td>Other oral diseases</td>
                                <td>7 (2)</td>
                                <td>4 (3)</td>
                            </tr>
                            <tr valign="top">
                                <td>Radiology</td>
                                <td>7 (2)</td>
                                <td>4 (3)</td>
                            </tr>
                            <tr valign="top">
                                <td>Causative agent</td>
                                <td>20 (6)</td>
                                <td>9 (6)</td>
                            </tr>
                            <tr valign="top">
                                <td>Medication</td>
                                <td>36 (11)</td>
                                <td>19 (13)</td>
                            </tr>
                            <tr valign="top">
                                <td>Materials</td>
                                <td>44 (14)</td>
                                <td>17 (12)</td>
                            </tr>
                            <tr valign="top">
                                <td>Basic sciences</td>
                                <td>13 (4)</td>
                                <td>6 (4)</td>
                            </tr>
                            <tr valign="top">
                                <td>Research</td>
                                <td>19 (6)</td>
                                <td>7 (5)</td>
                            </tr>
                            <tr valign="top">
                                <td>Total</td>
                                <td>325</td>
                                <td>143</td>
                            </tr>
                        </tbody>
                    </table>
                    <table-wrap-foot>
                        <fn id="table1fn1">
                            <p>
                                <sup>a</sup> Collocated phrases are bigrams and trigrams; selection based on pointwise mutual information score and clinical relevance.</p>
                        </fn>
                        <fn id="table1fn2">
                            <p>
                                <sup>b</sup> Percentage of phrases computed relative to the total number of phrases and rounded.</p>
                        </fn>
                        <fn id="table1fn3">
                            <p>
                                <sup>c</sup> Some keywords occur in more than one category. Thus, the total number of instances is greater than the number of unique keywords.</p>
                        </fn>
                        <fn id="table1fn4">
                            <p>
                                <sup>d</sup> Percentage of keywords computed relative to the total number of instances of keywords and rounded.</p>
                        </fn>
                    </table-wrap-foot>
                </table-wrap>
            </sec>
            <sec>
                <title>Finding Relevant Messages: A Demonstration</title>
                <p>Two academic dentists conducting a qualitative study selected a subset of phrases (n<sub>p</sub> = 144) with embedded keywords (n<sub>kw</sub> = 95) in 10 of 13 categories potentially related to their research question.</p>
                <p>Over k selected categories (k = 1 ... 10) and after deduplication, we retrieved 305 messages (range n<sub>k</sub>, 1&#8211;119 messages) with 520 instances of matching phrases; 948 messages (range n<sub>k</sub>, 12&#8211;343) with 1411 instances of matching keywords; and 996 messages (range n<sub>k</sub>, 12&#8211;363) with 1931 instances of matching phrases or keywords (see <xref ref-type="table" rid="table2">Table 2</xref>). The number of characters between words in a phrase ranged from 0 to 75, after deleting white spaces and punctuation.</p>
                <table-wrap id="table2" position="float">
                    <label>Table 2</label>
                    <caption>
                        <p>Number of messages with phrases or keywords retrieved for content analyses by selected category</p>
                    </caption>
                    <table cellpadding="8" cellspacing="0" border="1" rules="groups" frame="hsides" width="1000">
                        <col width="233" />
                        <col width="233" />
                        <col width="225" />
                        <col width="310" />
                        <thead>
                            <tr valign="bottom">
                                <td>Selected category<sup>a</sup>
                                </td>
                                <td>n of messages<sup>b</sup> (n of phrases)<sup>c</sup>
                                </td>
                                <td>n of messages (n of keywords)</td>
                                <td>n of messages (n of phrases or keywords)</td>
                            </tr>
                        </thead>
                        <tbody>
                            <tr valign="top">
                                <td>Systemic disease</td>
                                <td>119 (164)</td>
                                <td>284 (384)</td>
                                <td>299 (548)</td>
                            </tr>
                            <tr valign="top">
                                <td>Periodontics</td>
                                <td>14 (27)</td>
                                <td>51 (51)</td>
                                <td>54 (78)</td>
                            </tr>
                            <tr valign="top">
                                <td>Oral and maxillofacial surgery</td>
                                <td>36 (40)</td>
                                <td>106 (113)</td>
                                <td>106 (153)</td>
                            </tr>
                            <tr valign="top">
                                <td>Other oral diseases</td>
                                <td>17 (24)</td>
                                <td>44 (56)</td>
                                <td>48 (80)</td>
                            </tr>
                            <tr valign="top">
                                <td>Radiology</td>
                                <td>1 (1)</td>
                                <td>12 (12)</td>
                                <td>12 (13)</td>
                            </tr>
                            <tr valign="top">
                                <td>Causative agent</td>
                                <td>55 (78)</td>
                                <td>79 (95)</td>
                                <td>102 (173)</td>
                            </tr>
                            <tr valign="top">
                                <td>Medication</td>
                                <td>70 (110)</td>
                                <td>343 (377)</td>
                                <td>363 (487)</td>
                            </tr>
                            <tr valign="top">
                                <td>Materials</td>
                                <td>4 (4)</td>
                                <td>44 (50)</td>
                                <td>44 (54)</td>
                            </tr>
                            <tr valign="top">
                                <td>Basic sciences</td>
                                <td>8 (12)</td>
                                <td>157 (164)</td>
                                <td>160 (176)</td>
                            </tr>
                            <tr valign="top">
                                <td>Research</td>
                                <td>40 (60)</td>
                                <td>89 (109)</td>
                                <td>100 (169)</td>
                            </tr>
                            <tr valign="top">
                                <td>Total</td>
                                <td>305 (520)</td>
                                <td>948 (1411)</td>
                                <td>996 (1931)</td>
                            </tr>
                        </tbody>
                    </table>
                    <table-wrap-foot>
                        <fn id="table2fn1">
                            <p>
                                <sup>a</sup> Categories selected from the full set by qualitative researchers.</p>
                        </fn>
                        <fn id="table2fn2">
                            <p>
                                <sup>b</sup> Number of messages after deduplication.</p>
                        </fn>
                        <fn id="table2fn3">
                            <p>
                                <sup>c</sup> Collocated phrases are bigrams and trigrams; selection based on pointwise mutual information score and clinical relevance.</p>
                        </fn>
                    </table-wrap-foot>
                </table-wrap>
                <p>To interpret <xref ref-type="table" rid="table2">Table 2</xref>, consider the row for the category <italic>medication</italic>. In this category, we retrieved 70 messages with 110 matches for collocated phrases, such as <italic>intravenous bisphosphonates</italic> from the subcategory <italic>cancer drugs</italic> (see <xref ref-type="app" rid="app1">Multimedia Appendix 1</xref>). We also retrieved 343 messages with 377 matches for keywords, such as <italic>proinflammatory</italic> from the subcategory <italic>immune system</italic>. Finally, we retrieved 363 messages with 487 matches for phrases or keywords selected by the dentists in the category <italic>medication</italic>.</p>
            </sec>
        </sec>
        <sec sec-type="discussion">
            <title>Discussion</title>
            <sec>
                <title>Summary of Main Findings</title>
                <p>A workflow with a sequence of machine-based steps and human classification of NLP-discovered phrases can support researchers who need to identify relevant messages in a much larger corpus. NLP-discovered phrases and keywords are useful as search strings to aid targeted retrieval. We demonstrate the feasibility of our procedures for qualitative researchers by retrieving a manageable set of messages concerning systemic and oral disease.</p>
            </sec>
            <sec>
                <title>Surveys Versus Textual Analysis</title>
                <p>The reader might wonder, &#8220;Why bother with developing this workflow to support qualitative researchers? Why not survey the members of the virtual CoP and ask them outright about their information needs?&#8221;</p>
                <p>In the research literature, studies of information needs and barriers typically focus on clinicians and primary or ambulatory care settings. Of these, just a few studies consider dentists [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. So far, most of what we know is derived from survey questionnaires with items in a forced-choice format. The use of other methods is less common (eg, see [<xref ref-type="bibr" rid="ref24">24</xref>]), even though relevant methods exist in commerce and public health. For example, marketing analysts of social media use text analytics to understand customer sentiment in unstructured text (see [<xref ref-type="bibr" rid="ref25">25</xref>] for an accessible introduction), and researchers in infodemiology are developing mixed methods for monitoring content posted to the Internet [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>].</p>
                <p>Aside from the cost of developing sound surveys with appropriate sampling plans, a serious limitation is that respondents may not accurately remember the nature of their needs for evidence-based clinical information or the contexts in which needs arise. Interesting alternatives to surveys include analysis of cultural artifacts (eg, texts, images, or videos), face-to-face interviews, and field observation [<xref ref-type="bibr" rid="ref28">28</xref>].</p>
                <p>The investigators on our team whose project we used to demonstrate the feasibility of our procedures elected textual analysis as a way to understand clinical messages. For them, the corpus of messages posted by practicing dentists regarding specific patients or conditions is a rich data source. Appealing aspects of the corpus include the following: (1) information needs are contextually embedded, (2) messages are written in the &#8220;natural language&#8221; of dentists, and (3) discoverable clinical topics may not be what we would find with a questionnaire.</p>
                <p>Another reason for our team&#8217;s interest in textual analysis is that findings from a qualitative study can be compared with those from our own surveys (eg, see [<xref ref-type="bibr" rid="ref23">23</xref>]), as well as from studies conducted by other teams. This will allow future assessment of threats to validity associated with method, and whether information derived from different sources is complementary.</p>
            </sec>
            <sec>
                <title>Subscriber Participation</title>
                <p>The very skewed distribution of subscriber participation in this study is quite similar to findings reported by Falkman et al [<xref ref-type="bibr" rid="ref10">10</xref>], as well as Nonnecke and Preece [<xref ref-type="bibr" rid="ref29">29</xref>]. Using the language of Wenger et al [<xref ref-type="bibr" rid="ref30">30</xref>], Falkman and colleagues describe three groups according to their level of participation: a core group of leaders, an active group who regularly participate, and a disproportionately large group of members on the periphery. Presumably, the 5% of dental practitioners in this study who posted about half of the messages to the online discussion list were the leaders of their virtual CoP. The middle group varied considerably in their degree of participation, but they did contribute to the message traffic. Arguably, the 46% of the subscribing practitioners who never posted messages during the study interval were the peripheral group of &#8220;lurkers&#8221; or bystanders.</p>
                <p>Interestingly, lurking on the periphery does not imply that the online community has little to offer this group. Even though passive, lurkers can still learn from core and active members who serve as information providers [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. In fact, peripheral participation may be essential for the viability of a CoP [<xref ref-type="bibr" rid="ref31">31</xref>] because lurking, even with its negative connotations, is &#8220;a form of participation that is both acceptable and beneficial to online groups&#8221; (p. 6, [<xref ref-type="bibr" rid="ref29">29</xref>]).</p>
                <p>The qualitative researchers in our group believe that clinical topics initiated and discussed by leaders and active members are probably of interest to members on the periphery. For one, they assume passive members read at least some of the messages delivered to them. They further assume that disaffected members will unsubscribe. To the extent that they are wrong, the topical domain that we have discovered may reflect the interests of core and active members rather than the entire CoP. Nevertheless, it seems reasonable to study this online dental community, as the pattern of participation is typical of other communities of practice and electronic discussion lists.</p>
            </sec>
            <sec>
                <title>Natural Language Processing</title>
                <p>To cope with the noisy and informal nature of email, we heavily processed the messages. In so doing, we may have inadvertently overlooked important content-bearing phrases by deriving collocations from a much-reduced set of tokens. Nevertheless, collocations are much more informative than frequent phrases [<xref ref-type="bibr" rid="ref18">18</xref>]. The latter are usually uninteresting, at least in this context, and seem to derive from ordinary language, repeated self-promotion, and banner advertisements. Despite our best efforts, we were unable to delete all of the text-based noise.</p>
                <p>Many of the messages include excerpts from news items, magazine articles, or research articles. These excerpts seem to have a disproportionate number of clinical phrases relative to message content written by subscribers. (Chew and Eysenbach [<xref ref-type="bibr" rid="ref26">26</xref>] identified a similar problem when analyzing the content of posts to Twitter (&#8220;tweets&#8221;; see [<xref ref-type="bibr" rid="ref32">32</xref>]) during the 2009 H1N1 pandemic. They cautioned that key phrases in spam and popular news might affect retrieval of tweets and activity over time.) Because we were unable to identify automatically all of the imported content, we analyzed the entire message after preprocessing. However, one could argue that members, especially leaders, bring in relevant text and that mining messages with imported text still leads to a reasonable set of NLP-derived phrases.</p>
            </sec>
            <sec>
                <title>Finding Relevant Messages: A Demonstration</title>
                <p>In this study, we demonstrate the potential usefulness of our procedures by retrieving a manageable set of relevant messages for qualitative researchers. Their research entails exploring dentists&#8217; knowledge of the relationship between systemic and oral disease expressed in messages. To understand how they can work with messages sorted by category and type of match, consider the following scenario.</p>
                <p>Assume the researchers can handle about 300 messages for labor-intensive content analyses. They could design a broad or focused study by considering the number and type of match in each category. For example, for a broad study, they could analyze the 305 messages with clinical content-bearing phrases that we retrieved for the categories they had selected (see <xref ref-type="table" rid="table2">Table 2</xref>). For a more focused study, they could elect to work with messages from just the first category, <italic>systemic disease,</italic> which has 299 messages with 548 instances of phrases or keywords. Alternatively, they could select messages in some other combination of categories and type of match with the constraint that the total number of messages to analyze is about 300. If they decide to add a clinician to the team or devote more time to the project, they could analyze a larger set of messages.</p>
                <p>By sorting the messages we retrieved into the categories selected a priori by the qualitative researchers, we were able to create a useful database that encourages flexible investigation.</p>
            </sec>
            <sec>
                <title>Limitations</title>
                <p>A major limitation of this study is that we used a single source to mine electronic messages. It is possible that the NLP-discovered phrases and their subsequent classification will not generalize to other communities. In other words, the topical domain that we discovered may not describe the clinical interests of other practitioners, such as dentists who prefer to remain offline. Even if our version of the topical domain is useful, we still need to assess whether and how it changes over time. Additionally, other methods such as latent semantic analysis, sometimes referred to as latent semantic indexing [<xref ref-type="bibr" rid="ref33">33</xref>-<xref ref-type="bibr" rid="ref35">35</xref>], could yield a different set of topics. Finally, although we took care to reach consensus when classifying phrases, other dental researchers could have seen a different structure. Nevertheless, the limitations of any feasibility study are offset by the potential for usefulness and discovery. We believe the limitations of this study can be addressed in the future with formal evaluations that compare methods and communities.</p>
            </sec>
            <sec>
                <title>Future Research</title>
                <p>Each step in the workflow presents opportunities for further research. Nevertheless, once the system we are developing becomes reasonably efficient and robust, a cost-benefit analysis will be appropriate. For example, we could compare the labor involved and quality of retrieval for a simple random sample of messages with ad hoc keyword searches as a baseline versus our system.</p>
                <p>Other methods to identify clinically relevant messages, such as summarization and clustering of similar summaries [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>], or use of an ontology to enable retrieval (eg, see [<xref ref-type="bibr" rid="ref38">38</xref>]) could be worthwhile. Also, discourse analysis [<xref ref-type="bibr" rid="ref18">18</xref>] of the threaded messages could help us better understand how clinicians respond to the information needs of their peers, and whether the shared information is in keeping with the best evidence in published guidelines.</p>
                <p>Ultimately, this program of research will help us improve knowledge transfer of useful information for the legions of dentists who practice in relative isolation.</p>
            </sec>
        </sec>
    </body>
    <back>
        <ack>
            <p>We would like to thank David S. Dodell, DMD, for his help in obtaining the messages posted to the discussion list. The Pittsburgh Biomedical Informatics Training Program 5T15LM007059 partially supported this research.</p>
        </ack>
        <fn-group>
            <fn fn-type="conflict">
                <p>None declared</p>
            </fn>
        </fn-group>
        <app-group>
            <app id="app1">
                <title>Multimedia Appendix 1</title>
                <p>Classification of dental phrases with keywords. A table of keywords by category is displayed at the end of the classification.</p>
                <media xlink:href="jmir_v13i4e98_app1.pdf" xlink:title="PDF file (Adobe PDF File), 149 KB" />
            </app>
        </app-group>
        <glossary>
            <title>Abbreviations</title>
            <def-list>
                <def-item>
                    <term id="abb1">CoP</term>
                    <def>
                        <p>community of practice</p>
                    </def>
                </def-item>
                <def-item>
                    <term id="abb2">NLP</term>
                    <def>
                        <p>natural language processing</p>
                    </def>
                </def-item>
                <def-item>
                    <term id="abb3">NLTK</term>
                    <def>
                        <p>Natural Language Toolkit</p>
                    </def>
                </def-item>
            </def-list>
        </glossary>
        <ref-list>
            <ref id="ref1">
                <label>1</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Curro</surname>
                            <given-names>FA</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Craig</surname>
                            <given-names>RG</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Van Thompson</surname>
                            <given-names>P</given-names>
                        </name>
                    </person-group>
                    <article-title>Practice-based research networks and their impact on dentistry: creating a pathway for change in the profession</article-title>
                    <source>Compend Contin Educ Dent</source>
                    <year>2009</year>
                    <month>05</month>
                    <volume>30</volume>
                    <issue>4</issue>
                    <fpage>184, 186</fpage>
                    <lpage>7</lpage>
                    <pub-id pub-id-type="medline">19441734</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2698714</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref2">
                <label>2</label>
                <nlm-citation citation-type="web">
                    <person-group person-group-type="author">
                        <collab>anonymous</collab>
                    </person-group>
                    <source>Health Sciences Library System</source>
                    <year>2011</year>
                    <access-date>2011-05-11</access-date>
                    <comment>Pitt Resources Quick Search<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.hsls.pitt.edu/">http://www.hsls.pitt.edu/</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5ybUzxPzl</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref3">
                <label>3</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Song</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Spallek</surname>
                            <given-names>H</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Polk</surname>
                            <given-names>D</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Schleyer</surname>
                            <given-names>T</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Wali</surname>
                            <given-names>T</given-names>
                        </name>
                    </person-group>
                    <article-title>How information systems should support the information needs of general dentists in clinical settings: suggestions from a qualitative study</article-title>
                    <source>BMC Med Inform Decis Mak</source>
                    <year>2010</year>
                    <volume>10</volume>
                    <fpage>7</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.biomedcentral.com/1472-6947/10/7" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1186/1472-6947-10-7</pub-id>
                    <pub-id pub-id-type="medline">20122272</pub-id>
                    <pub-id pub-id-type="pii">1472-6947-10-7</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2843644</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref4">
                <label>4</label>
                <nlm-citation citation-type="web">
                    <person-group person-group-type="author">
                        <collab>PubMed.gov</collab>
                    </person-group>
                    <source>US National Library of Medicine, National Institutes of Health</source>
                    <year>2011</year>
                    <access-date>2011-05-11</access-date>
                    <comment>PubMed<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.ncbi.nlm.nih.gov/pubmed/">http://www.ncbi.nlm.nih.gov/pubmed/</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5ybVxMtSC</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref5">
                <label>5</label>
                <nlm-citation citation-type="web">
                    <person-group person-group-type="author">
                        <collab>American Dental Association</collab>
                    </person-group>
                    <source>ADA</source>
                    <year>2011</year>
                    <access-date>2011-05-11</access-date>
                    <comment>Fees for Members<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.ada.org/3791.aspx">http://www.ada.org/3791.aspx</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5ybXRHA5P</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref6">
                <label>6</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Landry</surname>
                            <given-names>CF</given-names>
                        </name>
                    </person-group>
                    <article-title>Work roles, tasks, and the information behavior of dentists</article-title>
                    <source>J Am Soc Inf Sci Technol</source>
                    <year>2006</year>
                    <volume>57</volume>
                    <issue>14</issue>
                    <fpage>1896</fpage>
                    <lpage>908</lpage>
                    <pub-id pub-id-type="doi">10.1002/asi.20385</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref7">
                <label>7</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Schleyer</surname>
                            <given-names>TK</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Forrest</surname>
                            <given-names>JL</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Kenney</surname>
                            <given-names>R</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Dodell</surname>
                            <given-names>DS</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Dovgy</surname>
                            <given-names>NA</given-names>
                        </name>
                    </person-group>
                    <article-title>Is the Internet useful for clinical practice?</article-title>
                    <source>J Am Dent Assoc</source>
                    <year>1999</year>
                    <month>10</month>
                    <volume>130</volume>
                    <issue>10</issue>
                    <fpage>1501</fpage>
                    <lpage>11</lpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://jada.ada.org/cgi/pmidlookup?view=long&#38;pmid=10570599" />
                    </comment>
                    <pub-id pub-id-type="medline">10570599</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref8">
                <label>8</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Strother</surname>
                            <given-names>EA</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Lancaster</surname>
                            <given-names>DM</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Gardiner</surname>
                            <given-names>J</given-names>
                        </name>
                    </person-group>
                    <article-title>Information needs of practicing dentists</article-title>
                    <source>Bull Med Libr Assoc</source>
                    <year>1986</year>
                    <month>07</month>
                    <volume>74</volume>
                    <issue>3</issue>
                    <fpage>227</fpage>
                    <lpage>30</lpage>
                    <pub-id pub-id-type="medline">3742116</pub-id>
                    <pub-id pub-id-type="pmcid">PMC227838</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref9">
                <label>9</label>
                <nlm-citation citation-type="web">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Spallek</surname>
                            <given-names>H</given-names>
                        </name>
                    </person-group>
                    <source>National Institute of Dental &#38; Craniofacial Research, National Institutes of Health, Research Portfolio Online Reporting Tools (RePORT)</source>
                    <year>2011</year>
                    <month>05</month>
                    <day>11</day>
                    <comment>Implementing Research Findings and Evidence-Based Interventions Into Real-World Dental Practice Settings<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://projectreporter.nih.gov/project_info_description.cfm?aid=8028402&#38;icde=8077609">http://projectreporter.nih.gov/project_info_description.cfm?aid=8028402&#38;icde=8077609</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5ybZrgg6B</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref10">
                <label>10</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Falkman</surname>
                            <given-names>G</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Gustafsson</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Jontell</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Torgersson</surname>
                            <given-names>O</given-names>
                        </name>
                    </person-group>
                    <article-title>SOMWeb: a semantic web-based system for supporting collaboration of distributed medical communities of practice</article-title>
                    <source>J Med Internet Res</source>
                    <year>2008</year>
                    <volume>10</volume>
                    <issue>3</issue>
                    <fpage>e25</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2008/3/e25/" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/jmir.1059</pub-id>
                    <pub-id pub-id-type="medline">18725355</pub-id>
                    <pub-id pub-id-type="pii">v10i3e25</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2626431</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref11">
                <label>11</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Spallek</surname>
                            <given-names>H</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Irwin</surname>
                            <given-names>JY</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Schleyer</surname>
                            <given-names>T</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Butler</surname>
                            <given-names>BS</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Weiss</surname>
                            <given-names>PM</given-names>
                        </name>
                    </person-group>
                    <article-title>Supporting the emergence of dental informatics with an online community</article-title>
                    <source>Int J Comput Dent</source>
                    <year>2007</year>
                    <month>07</month>
                    <volume>10</volume>
                    <issue>3</issue>
                    <fpage>247</fpage>
                    <lpage>64</lpage>
                    <pub-id pub-id-type="medline">18271498</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2367256</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref12">
                <label>12</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Gilbert</surname>
                            <given-names>GH</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Williams</surname>
                            <given-names>OD</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Rindal</surname>
                            <given-names>DB</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Pihlstrom</surname>
                            <given-names>DJ</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Benjamin</surname>
                            <given-names>PL</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Wallace</surname>
                            <given-names>MC</given-names>
                        </name>
                        <collab>DPBRN Collaborative Group</collab>
                    </person-group>
                    <article-title>The creation and development of the dental practice-based research network</article-title>
                    <source>J Am Dent Assoc</source>
                    <year>2008</year>
                    <month>01</month>
                    <volume>139</volume>
                    <issue>1</issue>
                    <fpage>74</fpage>
                    <lpage>81</lpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://jada.ada.org/cgi/pmidlookup?view=long&#38;pmid=18167389" />
                    </comment>
                    <pub-id pub-id-type="medline">18167389</pub-id>
                    <pub-id pub-id-type="pii">139/1/74</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref13">
                <label>13</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Lindbloom</surname>
                            <given-names>EJ</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Ewigman</surname>
                            <given-names>BG</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Hickner</surname>
                            <given-names>JM</given-names>
                        </name>
                    </person-group>
                    <article-title>Practice-based research networks: the laboratories of primary care research</article-title>
                    <source>Med Care</source>
                    <year>2004</year>
                    <month>04</month>
                    <volume>42</volume>
                    <issue>4 Suppl</issue>
                    <fpage>III45</fpage>
                    <lpage>9</lpage>
                    <pub-id pub-id-type="medline">15026664</pub-id>
                    <pub-id pub-id-type="pii">00005650-200404001-00008</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref14">
                <label>14</label>
                <nlm-citation citation-type="book">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Roberts</surname>
                            <given-names>CW</given-names>
                        </name>
                    </person-group>
                    <person-group person-group-type="editor">
                        <name name-style="western">
                            <surname>Smelser</surname>
                            <given-names>NJ</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Baltes</surname>
                            <given-names>PB</given-names>
                        </name>
                    </person-group>
                    <article-title>Content analysis</article-title>
                    <source>International Encyclopedia of the Social &#38; Behavioral Sciences</source>
                    <year>2001</year>
                    <publisher-loc>Oxford, UK</publisher-loc>
                    <publisher-name>Pergamon</publisher-name>
                    <fpage>2697</fpage>
                    <lpage>702</lpage>
                </nlm-citation>
            </ref>
            <ref id="ref15">
                <label>15</label>
                <nlm-citation citation-type="confproc">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Bekhuis</surname>
                            <given-names>T</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Kreinacke</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Spallek</surname>
                            <given-names>H</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Song</surname>
                            <given-names>M</given-names>
                        </name>
                    </person-group>
                    <article-title>Using the Natural Language Toolkit to reduce the number of messages for in-depth content analyses: a case study</article-title>
                    <source>AMIA Annual Symposium Proceedings</source>
                    <year>2010</year>
                    <conf-name>AMIA 2010</conf-name>
                    <conf-date>Nov 13-17, 2010</conf-date>
                    <conf-loc>Washington, DC, USA</conf-loc>
                </nlm-citation>
            </ref>
            <ref id="ref16">
                <label>16</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Eysenbach</surname>
                            <given-names>G</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Till</surname>
                            <given-names>JE</given-names>
                        </name>
                    </person-group>
                    <article-title>Ethical issues in qualitative research on internet communities</article-title>
                    <source>BMJ</source>
                    <year>2001</year>
                    <month>11</month>
                    <day>10</day>
                    <volume>323</volume>
                    <issue>7321</issue>
                    <fpage>1103</fpage>
                    <lpage>5</lpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://bmj.com/cgi/pmidlookup?view=long&#38;pmid=11701577" />
                    </comment>
                    <pub-id pub-id-type="medline">11701577</pub-id>
                    <pub-id pub-id-type="pmcid">PMC59687</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref17">
                <label>17</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>de Vel</surname>
                            <given-names>O</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Anderson</surname>
                            <given-names>A</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Corney</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Mohay</surname>
                            <given-names>G</given-names>
                        </name>
                    </person-group>
                    <article-title>Mining e-mail content for author identification forensics</article-title>
                    <source>ACM SIGMOD Rec</source>
                    <year>2001</year>
                    <volume>30</volume>
                    <fpage>55</fpage>
                    <lpage>64</lpage>
                    <pub-id pub-id-type="doi">10.1145/604264.604272</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref18">
                <label>18</label>
                <nlm-citation citation-type="book">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Bird</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Klein</surname>
                            <given-names>E</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Loper</surname>
                            <given-names>E</given-names>
                        </name>
                    </person-group>
                    <source>Natural Language Processing With Python</source>
                    <year>2009</year>
                    <publisher-loc>Sebastopol, CA</publisher-loc>
                    <publisher-name>O&#039;Reilly</publisher-name>
                </nlm-citation>
            </ref>
            <ref id="ref19">
                <label>19</label>
                <nlm-citation citation-type="book">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Katakis</surname>
                            <given-names>I</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Tsoumakas</surname>
                            <given-names>G</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Vlahavas</surname>
                            <given-names>I</given-names>
                        </name>
                    </person-group>
                    <person-group person-group-type="editor">
                        <name name-style="western">
                            <surname>Vakali</surname>
                            <given-names>A</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Pallis</surname>
                            <given-names>G</given-names>
                        </name>
                    </person-group>
                    <article-title>Email mining: emerging techniques for email management</article-title>
                    <source>Web Data Management Practices: Emerging Techniques And Technologies</source>
                    <year>2006</year>
                    <publisher-loc>Hershey, PA</publisher-loc>
                    <publisher-name>Idea Group Publishing</publisher-name>
                    <fpage>219</fpage>
                    <lpage>40</lpage>
                </nlm-citation>
            </ref>
            <ref id="ref20">
                <label>20</label>
                <nlm-citation citation-type="web">
                    <source>Natural Language Toolkit</source>
                    <access-date>2011-05-09</access-date>
                    <comment>Home page<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.nltk.org/">http://www.nltk.org/</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5yYGW5pg6</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref21">
                <label>21</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Dilevko</surname>
                            <given-names>J</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Gottlieb</surname>
                            <given-names>L</given-names>
                        </name>
                    </person-group>
                    <article-title>The relevance of classification theory to textual analysis</article-title>
                    <source>Libr Inf Sci Res</source>
                    <year>2009</year>
                    <volume>31</volume>
                    <fpage>92</fpage>
                    <lpage>100</lpage>
                    <pub-id pub-id-type="doi">10.1016/j.lisr.2009.01.001</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref22">
                <label>22</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Duxbury</surname>
                            <given-names>AJ</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Leach</surname>
                            <given-names>FN</given-names>
                        </name>
                    </person-group>
                    <article-title>Drug information and the dental practitioner</article-title>
                    <source>Dent Update</source>
                    <year>1981</year>
                    <month>03</month>
                    <volume>8</volume>
                    <issue>2</issue>
                    <fpage>101</fpage>
                    <lpage>2, 106</lpage>
                    <pub-id pub-id-type="medline">6943113</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref23">
                <label>23</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Spallek</surname>
                            <given-names>H</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Song</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Polk</surname>
                            <given-names>DE</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Bekhuis</surname>
                            <given-names>T</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Frantsve-Hawley</surname>
                            <given-names>J</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Aravamudhan</surname>
                            <given-names>K</given-names>
                        </name>
                    </person-group>
                    <article-title>Barriers to implementing evidence-based clinical guidelines: a survey of early adopters</article-title>
                    <source>J Evid Based Dent Pract</source>
                    <year>2010</year>
                    <month>12</month>
                    <volume>10</volume>
                    <issue>4</issue>
                    <fpage>195</fpage>
                    <lpage>206</lpage>
                    <pub-id pub-id-type="doi">10.1016/j.jebdp.2010.05.013</pub-id>
                    <pub-id pub-id-type="medline">21093800</pub-id>
                    <pub-id pub-id-type="pii">S1532-3382(10)00131-4</pub-id>
                    <pub-id pub-id-type="pmcid">PMC3011934</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref24">
                <label>24</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Freimuth</surname>
                            <given-names>VS</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Massett</surname>
                            <given-names>HA</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Meltzer</surname>
                            <given-names>W</given-names>
                        </name>
                    </person-group>
                    <article-title>A descriptive analysis of 10 years of research published in the Journal of Health Communication</article-title>
                    <source>J Health Commun</source>
                    <year>2006</year>
                    <volume>11</volume>
                    <issue>1</issue>
                    <fpage>11</fpage>
                    <lpage>20</lpage>
                    <pub-id pub-id-type="doi">10.1080/10810730500461042</pub-id>
                    <pub-id pub-id-type="medline">16546916</pub-id>
                    <pub-id pub-id-type="pii">J35518W2537UH715</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref25">
                <label>25</label>
                <nlm-citation citation-type="web">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Sterne</surname>
                            <given-names>J</given-names>
                        </name>
                    </person-group>
                    <source>SAS Institute</source>
                    <year>2010</year>
                    <access-date>2011-09-30</access-date>
                    <comment>Text Analytics for Social Media: Evolving Tools for an Evolving Environment <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.sas.com/resources/whitepaper/wp_24091.pdf">http://www.sas.com/resources/whitepaper/wp_24091.pdf</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">625lHnY2R</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref26">
                <label>26</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Chew</surname>
                            <given-names>C</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Eysenbach</surname>
                            <given-names>G</given-names>
                        </name>
                    </person-group>
                    <article-title>Pandemics in the age of Twitter: content analysis of Tweets during the 2009 H1N1 outbreak</article-title>
                    <source>PLoS One</source>
                    <year>2010</year>
                    <volume>5</volume>
                    <issue>11</issue>
                    <fpage>e14118</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0014118" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1371/journal.pone.0014118</pub-id>
                    <pub-id pub-id-type="medline">21124761</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2993925</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref27">
                <label>27</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Eysenbach</surname>
                            <given-names>G</given-names>
                        </name>
                    </person-group>
                    <article-title>Infodemiology and infoveillance: framework for an emerging set of public health informatics methods to analyze search, communication and publication behavior on the Internet</article-title>
                    <source>J Med Internet Res</source>
                    <year>2009</year>
                    <volume>11</volume>
                    <issue>1</issue>
                    <fpage>e11</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2009/1/e11/" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/jmir.1157</pub-id>
                    <pub-id pub-id-type="medline">19329408</pub-id>
                    <pub-id pub-id-type="pii">v11i1e11</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2762766</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref28">
                <label>28</label>
                <nlm-citation citation-type="book">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Green</surname>
                            <given-names>J</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Thorogood</surname>
                            <given-names>N</given-names>
                        </name>
                    </person-group>
                    <person-group person-group-type="editor">
                        <name name-style="western">
                            <surname>Green</surname>
                            <given-names>JA</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Thorogood</surname>
                            <given-names>N</given-names>
                        </name>
                    </person-group>
                    <source>Qualitative Methods for Health Research (Introducing Qualitative Methods series). 2nd edition</source>
                    <year>2009</year>
                    <publisher-loc>Los Angeles, CA</publisher-loc>
                    <publisher-name>Sage</publisher-name>
                </nlm-citation>
            </ref>
            <ref id="ref29">
                <label>29</label>
                <nlm-citation citation-type="confproc">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Nonnecke</surname>
                            <given-names>B</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Preece</surname>
                            <given-names>J</given-names>
                        </name>
                    </person-group>
                    <article-title>Lurker demographics: counting the silent</article-title>
                    <source>Proceedings of CHI 2000: The Future is Here</source>
                    <year>2000</year>
                    <conf-name>SIGCHI Conference on Human Factors in Computing Systems</conf-name>
                    <conf-date>April 1-6, 2000</conf-date>
                    <conf-loc>The Hague, Netherlands</conf-loc>
                    <publisher-name>Association for Computing Machinery (ACM)</publisher-name>
                    <pub-id pub-id-type="doi">10.1145/332040.332409</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref30">
                <label>30</label>
                <nlm-citation citation-type="book">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Wenger</surname>
                            <given-names>E</given-names>
                        </name>
                        <name name-style="western">
                            <surname>McDermott</surname>
                            <given-names>R</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Snyder</surname>
                            <given-names>WM</given-names>
                        </name>
                    </person-group>
                    <source>Cultivating Communities of Practice: A Guide to Managing Knowledge</source>
                    <year>2002</year>
                    <publisher-loc>Boston, MA</publisher-loc>
                    <publisher-name>Harvard Business School Press</publisher-name>
                </nlm-citation>
            </ref>
            <ref id="ref31">
                <label>31</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Spallek</surname>
                            <given-names>H</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Butler</surname>
                            <given-names>BS</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Schleyer</surname>
                            <given-names>TK</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Weiss</surname>
                            <given-names>PM</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Wang</surname>
                            <given-names>X</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Thyvalikakath</surname>
                            <given-names>TP</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Hatala</surname>
                            <given-names>CL</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Naderi</surname>
                            <given-names>RA</given-names>
                        </name>
                    </person-group>
                    <article-title>Supporting emerging disciplines with e-communities: needs and benefits</article-title>
                    <source>J Med Internet Res</source>
                    <year>2008</year>
                    <volume>10</volume>
                    <issue>2</issue>
                    <fpage>e19</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2008/2/e19/" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/jmir.971</pub-id>
                    <pub-id pub-id-type="medline">18653443</pub-id>
                    <pub-id pub-id-type="pii">v10i2e19</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2483921</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref32">
                <label>32</label>
                <nlm-citation citation-type="web">
                    <source>Twitter, Inc</source>
                    <year>2011</year>
                    <access-date>2011-05-22</access-date>
                    <comment>twitter <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://twitter.com/">http://twitter.com/</ext-link>
                    </comment>
                    <pub-id pub-id-type="other">5ysGmdFEz</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref33">
                <label>33</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Bellegarda</surname>
                            <given-names>J</given-names>
                        </name>
                    </person-group>
                    <article-title>Exploiting latent semantic information in statistical language modeling</article-title>
                    <source>Proc IEEE</source>
                    <year>2000</year>
                    <volume>88</volume>
                    <issue>8</issue>
                    <fpage>1279</fpage>
                    <lpage>1296</lpage>
                    <pub-id pub-id-type="doi">10.1109/5.880084</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref34">
                <label>34</label>
                <nlm-citation citation-type="book">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Hand</surname>
                            <given-names>D</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Mannila</surname>
                            <given-names>H</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Smyth</surname>
                            <given-names>P</given-names>
                        </name>
                    </person-group>
                    <source>Principles of Data Mining</source>
                    <year>2001</year>
                    <publisher-loc>Cambridge, MA</publisher-loc>
                    <publisher-name>MIT Press</publisher-name>
                </nlm-citation>
            </ref>
            <ref id="ref35">
                <label>35</label>
                <nlm-citation citation-type="book">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Manning</surname>
                            <given-names>CD</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Raghavan</surname>
                            <given-names>P</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Schütze</surname>
                            <given-names>H</given-names>
                        </name>
                    </person-group>
                    <source>Introduction to Information Retrieval</source>
                    <year>2008</year>
                    <publisher-loc>New York, NY</publisher-loc>
                    <publisher-name>Cambridge University Press</publisher-name>
                </nlm-citation>
            </ref>
            <ref id="ref36">
                <label>36</label>
                <nlm-citation citation-type="confproc">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Radev</surname>
                            <given-names>DR</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Teufel</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Saggion</surname>
                            <given-names>H</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Lam</surname>
                            <given-names>W</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Blitzer</surname>
                            <given-names>J</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Qi</surname>
                            <given-names>H</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Celebi</surname>
                            <given-names>A</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Liu</surname>
                            <given-names>D</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Drabek</surname>
                            <given-names>E</given-names>
                        </name>
                    </person-group>
                    <article-title>Evaluation challenges in large-scale document summarization</article-title>
                    <source>ACL '03</source>
                    <year>2003</year>
                    <conf-name>41st Annual Meeting of the Association for Computational Linguistics</conf-name>
                    <conf-date>July 7-12, 2003</conf-date>
                    <conf-loc>Sapporo, Japan</conf-loc>
                    <pub-id pub-id-type="doi">10.3115/1075096.1075144</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref37">
                <label>37</label>
                <nlm-citation citation-type="confproc">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Saggion</surname>
                            <given-names>H</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Gaizauskas</surname>
                            <given-names>R</given-names>
                        </name>
                    </person-group>
                    <article-title>Multi-document summarization by cluster/profile relevance and redundancy removal</article-title>
                    <source>Proceedings</source>
                    <year>2004</year>
                    <conf-name>Document Understanding Conference (DUC04) 2004</conf-name>
                    <conf-date>May 6-7, 2004</conf-date>
                    <conf-loc>Boston, MA, USA</conf-loc>
                </nlm-citation>
            </ref>
            <ref id="ref38">
                <label>38</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Konovalov</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Scotch</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Post</surname>
                            <given-names>L</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Brandt</surname>
                            <given-names>C</given-names>
                        </name>
                    </person-group>
                    <article-title>Biomedical informatics techniques for processing and analyzing web blogs of military service members</article-title>
                    <source>J Med Internet Res</source>
                    <year>2010</year>
                    <volume>12</volume>
                    <issue>4</issue>
                    <fpage>e45</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2010/4/e45/" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/jmir.1538</pub-id>
                    <pub-id pub-id-type="medline">20923755</pub-id>
                    <pub-id pub-id-type="pii">v12i4e45</pub-id>
                </nlm-citation>
            </ref>
        </ref-list>
    </back>
</article>
