<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
    <front>
        <journal-meta>
            <journal-id journal-id-type="publisher-id">JMIR</journal-id>
            <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
            <journal-title>Journal of Medical Internet Research</journal-title>
            <issn pub-type="epub">1438-8871</issn>
            <publisher>
                <publisher-name>JMIR Publications Inc.</publisher-name>
                <publisher-loc>Toronto, Canada</publisher-loc>
            </publisher>
        </journal-meta>
        <article-meta>
            <article-id pub-id-type="publisher-id">v18i2e41</article-id>
            <article-id pub-id-type="pmid">26920122</article-id>
            <article-id pub-id-type="doi">10.2196/jmir.4738</article-id>
            <article-categories>
                <subj-group subj-group-type="heading">
                    <subject>Original Paper</subject>
                </subj-group>
                <subj-group subj-group-type="article-type">
                    <subject>Original Paper</subject>
                </subj-group>
            </article-categories>
            <title-group>
                <article-title>Garbage in, Garbage Out: Data Collection, Quality Assessment and Reporting Standards for Social Media Data Use in Health Research, Infodemiology and Digital Disease Detection</article-title>
            </title-group>
            <contrib-group>
                <contrib contrib-type="editor">
                    <name>
                        <surname>Eysenbach</surname>
                        <given-names>Gunther</given-names>
                    </name>
                </contrib>
            </contrib-group>
            <contrib-group>
                <contrib contrib-type="reviewer">
                    <name>
                        <surname>Chu</surname>
                        <given-names>Kar-Hai</given-names>
                    </name>
                </contrib>
                <contrib contrib-type="reviewer">
                    <name>
                        <surname>Chen</surname>
                        <given-names>Annie</given-names>
                    </name>
                </contrib>
                <contrib contrib-type="reviewer">
                    <name>
                        <surname>Zhang</surname>
                        <given-names>Ni</given-names>
                    </name>
                </contrib>
            </contrib-group>
            <contrib-group>
                <contrib contrib-type="author" id="contrib1" corresp="yes">
                    <name name-style="western">
                        <surname>Kim</surname>
                        <given-names>Yoonsang</given-names>
                    </name>
                    <degrees>PhD</degrees>
                    <xref rid="aff1" ref-type="aff">1</xref>
                    <address>
                        <institution>Health Media Collaboratory</institution>
                        <institution>Institute for Health Research and Policy</institution>
                        <institution>University of Illinois at Chicago</institution>
                        <addr-line>Westside Research Office Building, M/C 275</addr-line>
                        <addr-line>1747 W Roosevelt Rd</addr-line>
                        <addr-line>Chicago, IL, 60608</addr-line>
                        <country>United States</country>
                        <phone>1 312 413 7596</phone>
                        <fax>1 312 996 2703</fax>
                        <email>ykim96@uic.edu</email>
                    </address>
                    <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-1685-1753</ext-link>
                </contrib>
                <contrib contrib-type="author" id="contrib2">
                    <name name-style="western">
                        <surname>Huang</surname>
                        <given-names>Jidong</given-names>
                    </name>
                    <degrees>PhD</degrees>
                    <xref rid="aff1" ref-type="aff">1</xref>
                    <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-1646-5422</ext-link>
                </contrib>
                <contrib contrib-type="author" id="contrib3">
                    <name name-style="western">
                        <surname>Emery</surname>
                        <given-names>Sherry</given-names>
                    </name>
                    <degrees>PhD</degrees>
                    <xref rid="aff1" ref-type="aff">1</xref>
                    <ext-link ext-link-type="orcid">http://orcid.org/0000-0001-9278-9990</ext-link>
                </contrib>
            </contrib-group>
            <aff id="aff1">
                <sup>1</sup>
                <institution>Health Media Collaboratory</institution>
                <institution>Institute for Health Research and Policy</institution>
                <institution>University of Illinois at Chicago</institution>
                <addr-line>Chicago, IL</addr-line>
                <country>United States</country>
            </aff>
            <author-notes>
                <corresp>Corresponding Author: Yoonsang Kim <email>ykim96@uic.edu</email>
                </corresp>
            </author-notes>
            <pub-date pub-type="collection">
                <month>02</month>
                <year>2016</year>
            </pub-date>
            <pub-date pub-type="epub">
                <day>26</day>
                <month>02</month>
                <year>2016</year>
            </pub-date>
            <volume>18</volume>
            <issue>2</issue>
            <elocation-id>e41</elocation-id>
            <!--history from ojs - api-xml-->
            <history>
                <date date-type="received">
                    <day>21</day>
                    <month>5</month>
                    <year>2015</year>
                </date>
                <date date-type="rev-request">
                    <day>18</day>
                    <month>10</month>
                    <year>2015</year>
                </date>
                <date date-type="rev-recd">
                    <day>9</day>
                    <month>12</month>
                    <year>2015</year>
                </date>
                <date date-type="accepted">
                    <day>4</day>
                    <month>1</month>
                    <year>2016</year>
                </date>
            </history>
            <!--(c) the authors - correct author names and publication date here if necessary. Date in form ', dd.mm.yyyy' after jmir.org-->
            <copyright-statement>&#169;Yoonsang Kim, Jidong Huang, Sherry Emery. Originally published in the Journal of Medical Internet Research (http://www.jmir.org), 26.02.2016. </copyright-statement>
            <copyright-year>2016</copyright-year>
            <license license-type="open-access" xlink:href="http://creativecommons.org/licenses/by/2.0/">
                <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (http://creativecommons.org/licenses/by/2.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on http://www.jmir.org/, as well as this copyright and license information must be included.</p>
            </license>
            <self-uri xlink:href="http://www.jmir.org/2016/2/e41/" xlink:type="simple" />
<related-article related-article-type="commentary" vol="18" page="e219" xlink:href="http://www.jmir.org/2016/8/e219/" xlink:type="simple" />
            
<related-article related-article-type="commentary" vol="19" page="e165" xlink:href="http://www.jmir.org/2017/6/e165/" xlink:type="simple"/>

<abstract>
                <sec sec-type="background">
                    <title>Background</title>
                    <p>Social media have transformed the communications landscape. People increasingly obtain news and health information online and via social media. Social media platforms also serve as novel sources of rich observational data for health research (including infodemiology, infoveillance, and digital disease detection). While the number of studies using social data is growing rapidly, very few of these studies transparently outline their methods for collecting, filtering, and reporting those data. Keywords and search filters applied to social data form the lens through which researchers may observe what and how people communicate about a given topic. Without a properly focused lens, research conclusions may be biased or misleading. Standards of reporting data sources and quality are needed so that data scientists and consumers of social media research can evaluate and compare methods and findings across studies.</p>
                </sec>
                <sec sec-type="objective">
                    <title>Objective</title>
                    <p>We aimed to develop and apply a framework of social media data collection and quality assessment and to propose a reporting standard, which researchers and reviewers may use to evaluate and compare the quality of social data across studies.</p>
                </sec>
                <sec sec-type="methods">
                    <title>Methods</title>
                    <p>We propose a conceptual framework consisting of three major steps in collecting social media data: develop, apply, and validate search filters. This framework is based on two criteria: retrieval precision (how much of retrieved data is relevant) and retrieval recall (how much of the relevant data is retrieved). We then discuss two conditions that estimation of retrieval precision and recall rely on&#8212;accurate human coding and full data collection&#8212;and how to calculate these statistics in cases that deviate from the two ideal conditions. We then apply the framework on a real-world example using approximately 4 million tobacco-related tweets collected from the Twitter firehose.</p>
                </sec>
                <sec sec-type="results">
                    <title>Results</title>
                    <p>We developed and applied a search filter to retrieve e-cigarette&#8211;related tweets from the archive based on three keyword categories: devices, brands, and behavior. The search filter retrieved 82,205 e-cigarette&#8211;related tweets from the archive and was validated. Retrieval precision was calculated above 95% in all cases. Retrieval recall was 86% assuming ideal conditions (no human coding errors and full data collection), 75% when unretrieved messages could not be archived, 86% assuming no false negative errors by coders, and 93% allowing both false negative and false positive errors by human coders.</p>
                </sec>
                <sec sec-type="conclusions">
                    <title>Conclusions</title>
                    <p>This paper sets forth a conceptual framework for the filtering and quality evaluation of social data that addresses several common challenges and moves toward establishing a standard of reporting social data. Researchers should clearly delineate data sources, how data were accessed and collected, and the search filter building process and how retrieval precision and recall were calculated. The proposed framework can be adapted to other public social media platforms.</p>
                </sec>
            </abstract>
            <kwd-group>
                <kwd>social media</kwd>
                <kwd>precision and recall</kwd>
                <kwd>sensitivity and specificity</kwd>
                <kwd>search filter</kwd>
                <kwd>Twitter</kwd>
                <kwd>standard reporting</kwd>
                <kwd>infodemiology</kwd>
                <kwd>infoveillance</kwd>
                <kwd>digital disease detection</kwd>
            </kwd-group>
        </article-meta>
    </front>
    <body>
        <sec sec-type="introduction">
            <title>Introduction</title>
            <p>Social media have transformed public and interpersonal communications. The Internet and social media have quickly become major sources of health information [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>], providing both broad and targeted exposure to such information as well as facilitating information-seeking and sharing. As people increasingly turn to social media for news and information [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>], these platforms can serve as novel sources of observational data for infodemiology, public health surveillance (infoveillance, digital disease detection) [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref11">11</xref>], tracking health attitudes and behavioral intention [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref16">16</xref>], and measuring community-level psychological characteristics related to health outcomes [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>].</p>
            <p>While Facebook remains the most commonly used social media platform, varying privacy settings and complex application programming interface (API) streams make the collection and interpretation of Facebook data for observational research extremely challenging. In contrast, Twitter, which is by nature a much more public-facing platform, has millions of active users who provide rich qualitative data in the content of microblog messages (tweets) as well as important quantitative data embedded in the metadata. Metadata fields describe the reach and patterns of the diffusion of a given message, along with some limited characteristics of the users posting messages. Similarly, YouTube has millions of active users who view, post, rate, and comment on its rich video content and advertising. A simple search of any social media platform can provide a tantalizing bounty of information. Yet despite the rich potential of these platforms for research and analysis, methods for collecting, cleaning, and reporting social media data can vary widely, making the evaluation and comparison of studies using those data difficult at best.</p>
            <p>Social media data collection in infodemiology is usually defined by the keywords and search filters used to retrieve data from the platform [<xref ref-type="bibr" rid="ref6">6</xref>]. As such, search filters are the lens through which we can observe what and how people communicate. If our lens is appropriately focused, we can identify content of interest and avoid collecting a lot of irrelevant information. Conversely, if our search is too narrow, we may miss important data and our conclusions may be biased. If it is too broad, we risk collecting a lot of irrelevant and potentially misleading material.</p>
            <p>A search filter is a set of keywords integrated with search rules that specify search strategies. While there is an intuitive simplicity in identifying keywords and search rules for a given research question, that seeming simplicity is deceptive. First, keyword selection is not simple. Language and popular culture vary by age, socioeconomic status, race/ethnicity, geographic location, etc. The language used on social media is often colloquial, creative, and varying. Further, users communicate differently across platforms, partly driven by the norms and technical constraints unique to each platform, and partly driven by the social function of each platform [<xref ref-type="bibr" rid="ref19">19</xref>]. For example, Twitter users are limited to 140 characters and typically post short messages using abbreviations and slang terms. Facebook posts can be longer and thus are more likely to contain multiple, different words for a single construct. YouTube videos are posted with titles and often tagged by the poster with keywords. Instagram posts typically have multiple hashtags that offer some indication of the content. If a researcher is not fluent&#8212;or at least familiar&#8212;with the language norms of a particular platform, their search filter may be overly broad, too narrow, or simply off-topic.</p>
            <p>The keyword is only one part of a filter; without practical rules, an intuitive search term can retrieve a lot of irrelevant information. For example, in tobacco research, the term &#8220;smoking&#8221; is critically important to any search for relevant content. But without further rules to refine that term, the keyword will retrieve plenty of content about &#8220;smoking marijuana,&#8221; &#8220;smoking ribs,&#8221; and &#8220;smoking hot girls&#8221; [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. A sentiment analysis of data retrieved with the broad &#8220;smoking&#8221; term would produce different results from data retrieved with a search filter that excluded other key terms that appear in close proximity to &#8220;smoking.&#8221; Therefore, developing reliable search filters requires a rigorous process to weed out irrelevant content and assure high-quality data collection [<xref ref-type="bibr" rid="ref20">20</xref>].</p>
            <p>While many studies have reported lists of keywords used to retrieve social data [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref24">24</xref>], few describe development of search filters [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>], and fewer yet attempt assessment of search filters by providing what fraction of collected data are relevant [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. One study provided the probabilities of losing possible relevant tweets by removing certain keywords [<xref ref-type="bibr" rid="ref22">22</xref>] but did not fully assess their search filter.</p>
            <p>Because the quality of social data and the interpretation of subsequent analyses depend on the quality of search filters, it is imperative for social media researchers to provide evidence of the quality and scope of their data: face validity is not sufficient. Computer scientists, communication researchers, and librarians, among others, use precision and recall as measures of search filter quality [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. Most studies that use social media data, however, do not attempt to objectively assess the quality of their data. There is often confusion about the meaning of precision and recall because they are used to assess the performance of machine learning classifiers or disease screening tests, which is different from what we aim to assess: the quality of retrieved data. To avoid confusion, we define the precision and recall used to assess the quality of retrieved data as the <italic>retrieval precision</italic> and <italic>retrieval recall</italic>. We use the terms precision/recall and retrieval precision/recall interchangeably throughout the paper unless a clear distinction is needed.</p>
            <p>In studies that do assess validity, search filters are compared against a gold standard that is typically human coding. No studies so far have considered the fact that human coders can make errors. Some errors associated with coding social media contents are inevitable despite well-trained human coders. An imperfect gold standard may cause bias in the validity assessment [<xref ref-type="bibr" rid="ref27">27</xref>]. While a perfect coding standard may be impractical, it is important that researchers are transparent and consistent about how they report the quality of coding and the strengths and limitations of their benchmark.</p>
            <p>In this paper, we describe a framework for the collection and assessment of social media data. The goal is to move toward a reporting standard that researchers and reviewers can use to compare the quality of data retrieved and analyzed across different studies. We use data collection from Twitter to illustrate concepts that can be adapted for other text-based social media platforms open to the public. Further, we use electronic cigarette (e-cigarette) content as a working example of a salient public health topic that is rapidly changing, with constantly emerging new brands and new slang [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref12">12</xref>] that challenge researchers&#8217; grasp of the language that social media users use to communicate about and market these products.</p>
            <p>Below, we first propose a conceptual framework for social media data collection. Within this framework we describe the development of search filters, illustrate the calculation of retrieval precision and recall, and illustrate common challenges and potential workarounds. Next, we apply our framework to a real-world example using data on e-cigarette content: approximately 4 million tweets retrieved from the Twitter firehose. Finally, we discuss the challenges of applying this rigorous approach to data collection and quality assessment and propose a checklist for reporting data preparation.</p>
        </sec>
        <sec sec-type="methods">
            <title>Methods</title>
            <sec>
                <title>Conceptual Framework for Social Data Collection and Quality Assessment</title>
                <p>We propose a framework that consists of three major steps to develop and validate search filters (see <xref ref-type="table" rid="table1">Table 1</xref>). The proposed framework is designed for users who can access partial or full data streams and can be applied to a human-based process that mainly relies on human judgment and coding, and an automated process supported by machine learning techniques and less human judgment [<xref ref-type="bibr" rid="ref28">28</xref>].</p>
                <table-wrap position="float" id="table1">
                    <label>Table 1</label>
                    <caption>
                        <p>A framework for Twitter data collection and validation.</p>
                    </caption>
                    <table width="687" border="1" cellpadding="7" cellspacing="0" rules="groups" frame="hsides">
                        <col width="73" />
                        <col width="584" />
                        <thead>
                            <tr valign="top">
                                <td>Step</td>
                                <td>Details</td>
                            </tr>
                        </thead>
                        <tbody>
                            <tr valign="top">
                                <td rowspan="2">Develop search filter</td>
                                <td>1. Build a list of search keywords: (a) Generate a list of candidate keywords based on expert knowledge, systematic search of topic-related language, and other resources, (b) Screen the keywords by examining relevance and frequency of posts, (c) Discard keywords that return posts with high proportion of irrelevant contents or relatively low frequency, and (d) Add and screen new keywords when new relevant terms and phrases emerge.</td>
                            </tr>
                            <tr valign="top">
                                <td>2. Integrate keywords with search rules (eg, Boolean operators) for a more focused search.</td>
                            </tr>
                            <tr valign="top">
                                <td>Apply search filter</td>
                                <td>3. The search filter retrieves and splits data into a retrieved set and an unretrieved set.</td>
                            </tr>
                            <tr valign="top">
                                <td rowspan="2">Assess search filter</td>
                                <td>4. Cross-tabulate data by gold standard and search filter: (a) Randomly sample from retrieved and unretrieved data; stratified sampling may be applied, (b) Manually code sampled data to determine relevance in both the retrieved and unretrieved sets, (c) Cross-tabulate sampled data by human-coded relevance (coded relevant vs irrelevant) and search filter retrieval status (retrieved vs unretrieved).</td>
                            </tr>
                            <tr valign="top">
                                <td>5. Compute retrieval precision and retrieval recall.</td>
                            </tr>
                        </tbody>
                    </table>
                </table-wrap>
            </sec>
            <sec>
                <title>Develop Search Filter</title>
                <sec>
                    <title>Build a List of Keywords</title>
                    <p>The first step in developing search filters is keyword selection. Depending on the research topic, keywords should be generated based on expert knowledge and systematic search of topic-related language. It is helpful to brainstorm and categorize keywords into subgroups. In our e-cigarette example, we categorized e-cigarette&#8211;related keywords into three subgroups: devices, brands, and behaviors.</p>
                    <p>Keyword selection also depends on social media platforms from which data are gathered. Twitter data raise unique challenges in keyword selection due to the limited number of characters allowed in a message. Twitter users often shorten messages they post by using hashtags, abbreviations, colloquialisms, and slang terms. For example, the term &#8220;square&#8221; is slang for cigarettes. A researcher without prior knowledge of this term might create a search filter that does not include the term, likely missing out on much tobacco smoking-related content. It is therefore crucial for researchers to keep up with current abbreviations, colloquial expressions, and slang terms in their research topics. Resources such as Urban Dictionary [<xref ref-type="bibr" rid="ref29">29</xref>] and a diverse team of researchers are essential to generate and understand such keywords.</p>
                    <p>Despite these efforts, many important terms may still be left out. It is therefore necessary to strategically employ broad search terms rather than highly specific terms/expressions. For example, a tweet like &#8220;A girl sitting next to me smokes squares&#8221; will be captured using a broad term &#8220;smoke&#8221; even if one does not know the term &#8220;square.&#8221; Although using broad search terms like &#8220;smoke&#8221; generates many irrelevant tweets, it reduces the probability of omitting relevant content. This is particularly useful when researchers do not have access to historical archives of social media platforms and are collecting data via streaming.</p>
                    <p>The list of keywords should be further screened and updated iteratively based on relevance and frequency. The keywords that return relatively few tweets (eg, &#60;10 over a month) or that return a small proportion of relevant tweets (eg, &#60;30% precision) may be discarded. That is, the signal (relevant data) to noise (irrelevant data) ratio should be considered [<xref ref-type="bibr" rid="ref22">22</xref>] and proper thresholds may depend on research questions. New keywords should be added to the list when new relevant terms and phrases emerge (eg, new e-cigarette brands, frequent co-occurring terms). Repeating Steps (a)-(d) of <italic>Build a list of search keywords</italic> in <xref ref-type="table" rid="table1">Table 1</xref> improves the quality of keywords and provides a good understanding of how social media users talk about a specific topic. If the data are collected for surveillance or forecasting, keywords should be updated periodically and related media coverage (if any) should be accounted for.</p>
                </sec>
                <sec>
                    <title>Integrate Keywords With Search Rules</title>
                    <p>A search filter is a combination of keywords and search rules. Integrating keywords with search rules greatly improves the ability of search filters to retrieve relevant messages. Search rules can be used to weed out irrelevant messages retrieved by broad terms. For example, in tobacco research, irrelevant tweets can be excluded by specifying that terms such as &#8220;barbeque&#8221; or &#8220;marijuana&#8221; do not appear in the tweets, while relevant tweets could be kept if a tweet contains both terms &#8220;smoke&#8221; and &#8220;square.&#8221; These search rules can be constructed using the Boolean operators (AND, OR, NOT) and data pre-processing techniques such as n-grams or proximity operator.</p>
                </sec>
            </sec>
            <sec>
                <title>Apply Search Filter</title>
                <p>
                    <xref ref-type="fig" rid="figure1">Figure 1</xref> displays a structure of data archive, search filter, and relevant tweets in the Twitterverse. The archive contains data returned by broad search terms (the blue circle with dotted line indicates the archive, and the red rectangle indicates all tweets relevant to a specific topic). The search filter returns &#8220;a + b&#8221; tweets. The archive may omit a small fraction of topic-relevant tweets &#8220;e&#8221; due to unknown terms, misspellings, etc.</p>
                <fig id="figure1" position="float">
                    <label>Figure 1</label>
                    <caption>
                        <p>The archive (a+b+c+d), retrieved tweets (a+b), and relevant tweets (a+c+e) in Twitterverse.</p>
                    </caption>
                    <graphic xlink:href="jmir_v18i2e41_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple" />
                </fig>
            </sec>
            <sec>
                <title>Assess Search Filter</title>
                <sec>
                    <title>Quality Measures: Definition</title>
                    <p>Any search filter should be validated based on its ability to distinguish relevant and irrelevant messages. Two criteria are typically used: <italic>retrieval recall</italic> and <italic>retrieval precision</italic> [<xref ref-type="bibr" rid="ref25">25</xref>]. Precision measures how much of the retrieved data is not garbage. Recall measures how much of the relevant data is retrieved.</p>
                    <p>
                        <xref ref-type="table" rid="table2">Table 2</xref> is constructed to evaluate a search filter against human coding. Precision is a conditional probability that a particular post is relevant, given that it is retrieved, calculated by a/(a + b). Recall is a conditional probability that a particular post is retrieved given that it is relevant, calculated by a/(a + c). Precision is also called positive predictive value, and recall is often called sensitivity of the search filter [<xref ref-type="bibr" rid="ref30">30</xref>]. There is a trade-off: high recall may be achieved at the expense of low precision (or low specificity), and vice versa. The F-score is used to report a single measure combining precision and recall [<xref ref-type="bibr" rid="ref31">31</xref>], computed by:</p>
                    <p>F=(1 + &#946;<sup>2</sup>)(precision)(recall)/(&#946;<sup>2</sup> precision + recall)&#160;(1)</p>
                    <p>Often &#946;=1 is used, and the resulting measure is called an F1 score. It can be shown, using Bayes&#8217; theorem [<xref ref-type="bibr" rid="ref32">32</xref>], that the recall can be computed by:</p>
                    <p>Recall=(precision)P(retr)/[(precision)P(retr) + P(relevant&#124;unretr)(1 &#8722; P(retr))]&#160;(2)</p>
                    <p>P(retr) denotes the proportion of tweets retrieved, and P(relevant&#124;unretr) denotes the proportion of unretrieved tweets found to be relevant.</p>
                    <p>Beyond precision and recall, specificity and negative predictive value (NPV) may be used. Specificity measures how many of the irrelevant tweets are discarded, defined by d/(b + d), and is closely related to precision. NPV is the proportion of unretrieved tweets found to be irrelevant, defined by d/(c + d). Note that P(relevant&#124;unretr)=1 &#8722; NPV. The proportion of relevant tweets may be obtained by (a + c)/n assuming that the data represent a random sample of the population and human coding is not subject to errors.</p>
                    <table-wrap position="float" id="table2">
                        <label>Table 2</label>
                        <caption>
                            <p>Assessment of search filter with human coding as a gold standard.</p>
                        </caption>
                        <table width="687" border="1" cellpadding="7" cellspacing="0" rules="groups" frame="hsides">
                            <col width="134" />
                            <col width="181" />
                            <col width="211" />
                            <col width="103" />
                            <thead>
                                <tr valign="top">
                                    <td rowspan="2">Search filter</td>
                                    <td colspan="2">Human coding</td>
                                    <td rowspan="2">Total</td>
                                </tr>
                                <tr valign="top">
                                    <td>Coded relevant</td>
                                    <td>Coded not-relevant</td>
                                </tr>
                            </thead>
                            <tbody>
                                <tr valign="top">
                                    <td>Retrieved</td>
                                    <td>a (True Positive)</td>
                                    <td>b (False Positive)</td>
                                    <td>a + b=n<sub>1</sub>
                                    </td>
                                </tr>
                                <tr valign="top">
                                    <td>Not retrieved</td>
                                    <td>c (False Negative)</td>
                                    <td>d (True Negative)</td>
                                    <td>c + d=n<sub>2</sub>
                                    </td>
                                </tr>
                                <tr valign="top">
                                    <td>Total</td>
                                    <td>a + c</td>
                                    <td>b + d</td>
                                    <td>n</td>
                                </tr>
                            </tbody>
                        </table>
                    </table-wrap>
                </sec>
                <sec>
                    <title>Sampling Plan for Human Coding</title>
                    <p>Calculation of retrieval precision and recall depends on the assessment of relevant versus irrelevant content. Typically, trained coders inspect a sample of retrieved data to manually evaluate relevancy as well as a sample of unretrieved data. This poses two important questions: how to sample and how many messages to sample. A practical sample size should be determined because it is labor intensive and time consuming to manually code millions of messages, and the estimates of precision and recall should be precise.</p>
                    <p>We suggest stratified sampling with retrieval status as strata and oversampling the retrieved messages. This is because typically the number of retrieved messages is small relative to the number of unretrieved messages (n<sub>1</sub>/n<sub>2</sub>&#60;0.1), and oversampling the retrieved messages ensures a desired level of statistical precision. Retrieval recall is more difficult to accurately estimate than retrieval precision because estimating <italic>c</italic> is often similar to finding a needle in a massive haystack of unretrieved messages. Therefore the statistical precision of the recall estimate is affected by the sample size. <xref ref-type="fig" rid="figure2">Figure 2</xref> displays how the average length of confidence intervals for retrieval recall estimates decreases as the sample size of unretrieved messages (=k) increases, while the sample size of retrieved messages is fixed. The gain in statistical precision diminishes as the number of unretrieved messages increases, and the gain is minimal above a certain sample size. By conducting a simulation or using a power analysis tool, a sample size that satisfies the desired level of statistical precision and feasibility can be determined. <xref ref-type="app" rid="app1">Multimedia Appendix 1</xref> describes how <xref ref-type="fig" rid="figure2">Figure 2</xref> was generated and discusses more about sample sizes.</p>
                    <fig id="figure2" position="float">
                        <label>Figure 2</label>
                        <caption>
                            <p>The average limits of 95% confidence intervals for recall (vertical axis) as the sample size of unretrieved messages increases (horizontal axis), fixing the sample size of retrieved data at 3000.</p>
                        </caption>
                        <graphic xlink:href="jmir_v18i2e41_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple" />
                    </fig>
                </sec>
                <sec>
                    <title>Estimation of Retrieval Precision and Retrieval Recall</title>
                    <p>Calculating retrieval precision and recall is straightforward when (1) human coding performs well as a gold standard and (2) <xref ref-type="table" rid="table2">Table 2</xref> is complete. We discuss in detail the cases in which one or both conditions are not satisfied and how to address them.</p>
                </sec>
                <sec>
                    <title>Assuming Human Coding Has No Error</title>
                    <sec>
                        <title>Ideal Conditions</title>
                        <p>The definitions of precision and recall are directly used when the two conditions are met. If stratified disproportionate sampling is used, appropriate weights should be applied to calculate recall. Confidence intervals can be estimated based on usual asymptotic methods [<xref ref-type="bibr" rid="ref33">33</xref>]. If Equation (2) is used to calculate recall, the interval estimate should account for variances of precision and P(relevant&#124;unretr).</p>
                    </sec>
                    <sec>
                        <title>Unretrieved Messages Could Not Be Archived</title>
                        <p>Messages matching search filters may be retrieved directly from a data provider so that only the retrieved messages are archived [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. Search filter precision can be estimated, but how do we estimate the recall without knowing <italic>c</italic> and <italic>d</italic>? In this case, the unretrieved total n<sub>2</sub> may be known approximately. Joseph et al used the Bayesian model to estimate recall and specificity when only n<sub>1</sub> and n<sub>2</sub> were given [<xref ref-type="bibr" rid="ref34">34</xref>]. Bayesian models often provide a feasible solution when insufficient information is contained in data to apply usual methods. Since <italic>a</italic> (thus <italic>b</italic>) can be observed in addition to n<sub>1</sub> and n<sub>2</sub>, we slightly modify their method.</p>
                        <p>Let <italic>&#960;</italic> be the prevalence of relevant messages, <italic>S</italic> be recall, and <italic>C</italic> be specificity of the search filter. The counts of tweets (<italic>a</italic>, <italic>b</italic>, <italic>c</italic>, <italic>d</italic>) in <xref ref-type="table" rid="table2">Table 2</xref> have a multinomial distribution with respective probabilities forming the likelihood function. Beta prior distributions for <italic>&#960;, S,</italic> and <italic>C</italic> seem sensible because their domain of positive density is bounded in (0,1). Let Beta(&#945;<sub>
                                <italic>&#960;</italic>
                            </sub>, &#946;<sub>
                                <italic>&#960;</italic>
                            </sub>), Beta(&#945;<sub>
                                <italic>S</italic>
                            </sub>, &#946;<sub>
                                <italic>S</italic>
                            </sub>), and Beta(&#945;<sub>
                                <italic>C</italic>
                            </sub>, &#946;<sub>
                                <italic>C</italic>
                            </sub>) denote the prior distributions of <italic>&#960;, S,</italic> and <italic>C</italic>, respectively, where Beta(&#945;, &#946;) is the beta density function with parameters &#945; and &#946;. Full conditional posterior distributions can be derived for all unknown quantities including <italic>c</italic>, and realized values are sampled from the posterior distributions using a Gibbs sampler. A Gibbs sampler draws from each full conditional posterior distribution sequentially, conditional on all other sampled quantities [<xref ref-type="bibr" rid="ref32">32</xref>]. It can be shown that the prevalence of relevant messages and recall of the search filter have the following posterior distributions: <italic>&#960;</italic> ~ Beta (a + c + &#945;<sub>
                                <italic>&#960;</italic>
                            </sub>, n &#8722; a &#8722; c <italic>+</italic> &#946;<sub>
                                <italic>&#960;</italic>
                            </sub>), S ~ Beta (a + &#945;<sub>
                                <italic>S</italic>
                            </sub> , c + &#946;<sub>
                                <italic>S</italic>
                            </sub>).</p>
                        <p>The quantity <italic>c</italic> is obtained in a previous sampling step. The Bayesian credible interval for an unknown quantity can be obtained based on the random draws from posterior distributions. The Gibbs sampling steps for all unknown quantities are provided in <xref ref-type="app" rid="app2">Multimedia Appendix 2</xref>.</p>
                    </sec>
                </sec>
                <sec>
                    <title>Assuming Human Coding Is Subject to Error</title>
                    <sec>
                        <title>Human Coding Is a Silver Standard</title>
                        <p>Evaluating search filters using imperfect human coding gives a biased impression of data quality. Recall and precision of a search filter depend on recall and specificity of the gold standard [<xref ref-type="bibr" rid="ref27">27</xref>]. Staquet et al considered a situation where a gold standard has 100% specificity and unknown recall. It may be relatively unlikely that a trained coder evaluates a given irrelevant tweet as relevant. For example, a coder likely will not determine the message &#8220;Come get a <italic>smoking</italic> hot jerk chicken wrap from us&#8221; as relevant to tobacco smoking. Thus it may be safe to assume, for a given topic, that the specificity of human coding is (close to) 100%. When this assumption is met, the search filter&#8217;s recall is unbiased and the bias-corrected equation for precision is given by precision=a/[S<sub>2</sub> (a + b)], where S<sub>2</sub> denotes the recall of human coding. Therefore, when human coding does not have perfect recall (false negative error), the method assuming the ideal conditions underestimates search filter precision.</p>
                    </sec>
                    <sec>
                        <title>Human Coding Is Not a Standard Classifier</title>
                        <p>Although in many cases human coding serves as a gold/silver standard, it may be an inadequate standard classifier for some topics because human language can be ambiguous (eg, &#8220;Leo DiCap is smoking&#8221;). Language used on Twitter is often colloquial and creative, and it may be difficult (or impossible) to interpret meaning within 140 characters without looking at related conversations (eg, &#8220;I can&#8217;t tell if that&#8217;s a chocolate Dutch my&#8221;; this was a reply to a tweet about Dutch chocolate-flavored cigarillo). Also, coders simply get tired. As a result, coders may falsely determine irrelevant posts to be relevant or vice versa (false positive and false negative error). Joseph et al extended the Bayesian model to the situation where results of two filters, neither of which was a gold standard, were available [<xref ref-type="bibr" rid="ref34">34</xref>]. We again modify their method to estimate search filter precision and recall.</p>
                        <p>Similar to <xref ref-type="table" rid="table2">Table 2</xref>, search filter and human coding results are cross-tabulated. Each cell can be split into truly relevant versus irrelevant contents (see <xref ref-type="table" rid="table3">Table 3</xref>). Let y<sub>1</sub> be the count of relevant messages out of the <italic>a</italic> messages retrieved by the search filter and human-coded relevant; the count of irrelevant messages is a &#8722; y<sub>1</sub>. The rest of the cells can be similarly split.</p>
                        <table-wrap position="float" id="table3">
                            <label>Table 3</label>
                            <caption>
                                <p>Multinomial likelihood contributions of all possible cases of observed data and unknown quantities (the unknown quantities of truly relevant tweets are denoted by y<sub>1</sub>, y<sub>2</sub>, y<sub>3</sub>, y<sub>4</sub>).</p>
                            </caption>
                            <table width="687" border="1" cellpadding="7" cellspacing="0" rules="groups" frame="hsides">
                                <col width="204" />
                                <col width="140" />
                                <col width="140" />
                                <thead>
                                    <tr valign="top">
                                        <td rowspan="2">Search filter (<italic>j</italic>=1)</td>
                                        <td colspan="2">Human coding (<italic>j</italic>=2)</td>
                                    </tr>
                                    <tr valign="top">
                                        <td>Coded relevant</td>
                                        <td>Coded not-relevant</td>
                                    </tr>
                                </thead>
                                <tbody>
                                    <tr valign="top">
                                        <td rowspan="2">Retrieved</td>
                                        <td>a &#8722; y<sub>1</sub>
                                        </td>
                                        <td>b &#8722; y<sub>2</sub>
                                        </td>
                                    </tr>
                                    <tr valign="top">
                                        <td>y<sub>1</sub>
                                        </td>
                                        <td>y<sub>2</sub>
                                        </td>
                                    </tr>
                                    <tr valign="top">
                                        <td rowspan="2">Not retrieved</td>
                                        <td>y<sub>3</sub>
                                        </td>
                                        <td>y<sub>4</sub>
                                        </td>
                                    </tr>
                                    <tr valign="top">
                                        <td>c &#8722; y<sub>3</sub>
                                        </td>
                                        <td>d &#8722; y<sub>4</sub>
                                        </td>
                                    </tr>
                                </tbody>
                            </table>
                        </table-wrap>
                        <p>Let <italic>&#960;</italic> be the prevalence of relevant messages, S<sub>1</sub> and C<sub>1</sub> be recall and specificity of the search filter, and S<sub>2</sub> and C<sub>2</sub> be recall and specificity of human coding. The eight cells in <xref ref-type="table" rid="table3">Table 3</xref> can be expressed as occurrences of multinomial events with probabilities that are functions of the five parameters. Again, a beta distribution can be used to set up the prior distribution of each parameter. Assume that S<sub>1</sub>, S<sub>2</sub>, C<sub>1</sub>, and C<sub>2</sub> are distributed as Beta(&#945;<sub>S1</sub>, &#946;<sub>S1</sub>), Beta(&#945;<sub>S2</sub>, &#946;<sub>S2</sub>), Beta(&#945;<sub>C1</sub>, &#946;<sub>C1</sub>), and Beta(&#945;<sub>C2</sub>, &#946;<sub>C2</sub>), respectively. It can be shown that the prevalence of relevant messages and search filter recall and specificity have the following posterior distributions:</p>
                        <p>
                            <italic>&#960;</italic> ~ Beta (&#8721;y<sub>
                                <italic>i</italic>
                            </sub> + &#945;<sub>
                                <italic>&#960;</italic>
                            </sub> , n &#8722; &#8721;y<sub>
                                <italic>i</italic>
                            </sub> + &#946;<sub>
                                <italic>&#960;</italic>
                            </sub>) for <italic>i</italic>=1,2,3,4</p>
                        <p>S<sub>1</sub>~ Beta (y<sub>1</sub>+ y<sub>2</sub>+ &#945;<sub>S1</sub>, y<sub>3</sub>+ y<sub>4</sub>+ &#946;<sub>S1</sub>)</p>
                        <p>C<sub>1</sub>~ Beta (c + d &#8722; y<sub>3</sub> &#8722; y<sub>4</sub>+ &#945;<sub>C1</sub>, a + b &#8722; y<sub>1</sub> &#8722; y<sub>2</sub>+ &#946;<sub>C1</sub>)</p>
                        <p>The precision and NPV of search filter can be obtained by the equations:</p>
                        <p>Precision<sub>1</sub>=S<sub>1</sub>
                            <italic>&#960;</italic>/[S<sub>1</sub>
                            <italic>&#960;</italic> + (1 &#8722; C<sub>1</sub>)(1 &#8722; <italic>&#960;</italic>)]</p>
                        <p>NPV<sub>1</sub>=C<sub>1</sub>(1 &#8722; <italic>&#960;</italic>)/[C<sub>1</sub>(1 &#8722; <italic>&#960;</italic>) + (1 &#8722; S<sub>1</sub>) <italic>&#960;</italic>]</p>
                        <p>These are based on the random draws from posterior distributions of <italic>&#960;</italic>, S<sub>1</sub>, and C<sub>1</sub>. <xref ref-type="app" rid="app3">Multimedia Appendix 3</xref> describes the Gibbs sampling steps to obtain random draws from posterior distributions of all unknown quantities including precision and recall of human coding.</p>
                    </sec>
                </sec>
            </sec>
        </sec>
        <sec sec-type="results">
            <title>Results</title>
            <sec>
                <title>Develop Search Filter</title>
                <p>We obtained Twitter data via an API called Firehose from Gnip, Inc., licensed to provide access to the full stream and historic archive of Twitter data. Access to Firehose is not free as opposed to publicly available data streams such as Streaming API. The Twitter Firehose returned 3,954,575 unique tweets that matched broad keywords about tobacco smoking in October 2012, forming an archive. The archive provided a base to construct <xref ref-type="table" rid="table2">Table 2</xref>.</p>
                <p>We developed a search filter to retrieve e-cigarette-related contents, building around three categories of e-cigarette-related tweets: alternative terms and device parts of e-cigarettes, brand names, and related behavior. We tested keywords using the Twitter Search Engine [<xref ref-type="bibr" rid="ref35">35</xref>] without logging into our Twitter accounts to avoid search bias. We screened and discarded keywords that returned irrelevant tweets more than 70% of the time or that returned &#60;10 tweets over a month. When unknown but seemingly relevant terms and phrases that co-occur with our keywords emerged, we checked them in an urban dictionary and other social media platforms, added them to the list, and screened them on Twitter Search. We repeated Steps 1-4 from <xref ref-type="table" rid="table1">Table 1</xref> until no more seemingly important keywords were found.</p>
                <p>The resulting keyword list included singular and plural forms of e-cigarette terms, different verb forms of behavior terms, and frequent misspellings. We filtered out tweets containing the keywords &#8220;atomizer&#8221; AND &#8220;perfume&#8221; as those were likely to describe perfume bottles. Those tweeted by or mentioning @blucigs, an e-cigarette promoting account, were collected. The final list of keywords and rules is presented in <xref ref-type="app" rid="app4">Multimedia Appendix 4</xref>.</p>
            </sec>
            <sec>
                <title>Assess e-Cigarette Search Filter</title>
                <sec>
                    <title>Sampling Plan for Human Coding</title>
                    <p>We conducted stratified sampling with retrieval status as strata. A small simulation was performed to determine sample size in each stratum. Data were generated assuming that N was 4 million, retrieval precision was 95%, and retrieval recall was 84%. The simulation details are described in <xref ref-type="app" rid="app1">Multimedia Appendix 1</xref> (Case 1). Based on the simulation, we determined that random sampling above 4000 from retrieved tweets and above 6000 from unretrieved tweets would be sufficient.</p>
                </sec>
                <sec>
                    <title>Assuming Human Coding Has No Error</title>
                    <sec>
                        <title>Ideal Conditions</title>
                        <p>The e-cigarette search filter retrieved 82,205 tweets from the archive, yielding P(retr)=0.0208. We randomly sampled 4373 from the retrieved set and coded 4176 of those as relevant, resulting in 95.5% retrieval precision (95% CI 94.9-96.1). <xref ref-type="table" rid="table4">Table 4</xref> presents the number of tweets cross-tabulated by human coding and search filter; the number of retrieved tweets was adjusted for the disproportionate sampling fraction. Out of 6305 randomly sampled unretrieved tweets, 20 were found relevant, yielding P(relevant&#124;unretr)=0.0032. The retrieval recall was 86.37% (95% CI 81.4-91.9) by Equation (2). The F1 score was 90.7%.</p>
                        <table-wrap position="float" id="table4">
                            <label>Table 4</label>
                            <caption>
                                <p>Search filter versus human coding on sampled data adjusted for sampling fraction.</p>
                            </caption>
                            <table width="687" border="1" cellpadding="7" cellspacing="0" rules="groups" frame="hsides">
                                <col width="154" />
                                <col width="182" />
                                <col width="227" />
                                <col width="66" />
                                <thead>
                                    <tr valign="top">
                                        <td rowspan="2">Search filter</td>
                                        <td colspan="2">Human coding</td>
                                        <td rowspan="2">Total</td>
                                    </tr>
                                    <tr valign="top">
                                        <td>Coded relevant</td>
                                        <td>Coded not-relevant</td>
                                    </tr>
                                </thead>
                                <tbody>
                                    <tr valign="top">
                                        <td>Retrieved</td>
                                        <td>128</td>
                                        <td>6</td>
                                        <td>134</td>
                                    </tr>
                                    <tr valign="top">
                                        <td>Not retrieved</td>
                                        <td>20</td>
                                        <td>6285</td>
                                        <td>6305</td>
                                    </tr>
                                    <tr valign="top">
                                        <td>Total</td>
                                        <td>148</td>
                                        <td>6291</td>
                                        <td>6439</td>
                                    </tr>
                                </tbody>
                            </table>
                        </table-wrap>
                    </sec>
                    <sec>
                        <title>Unretrieved Messages Could Not Be Archived</title>
                        <p>To demonstrate the method, we assumed that the archive contained only the tweets retrieved by the e-cigarette search filter. After assigning initial values (<xref ref-type="app" rid="app2">Multimedia Appendix 2</xref>), a value of precision was sampled from the uniform distribution with limits equal to the 95% confidence interval of the precision (94.9-96.1). We used n<sub>1</sub>=82,205 and n<sub>2</sub>=3,872,370 in the subsequent steps. The Gibbs sampler was run for 100,000 cycles, and the first 10,000 cycles were discarded as burn-in. The prior distribution and posterior inference results are presented in <xref ref-type="table" rid="table5">Table 5</xref>. Prevalence indicates the proportion of e-cigarette&#8211;relevant tweets within the archive. Prior distributions were set based on our experience: the specificity is usually high due to low prevalence, and we are confident that the search filter captures the majority of e-cigarette tweets, although rather high uncertainty was reflected in the prior density of recall (as low as 34%). The F1 score values are computed by applying the sampled values of recall and precision to Equation (1) at the end of each cycle. The posterior mean of retrieval recall is 75%: between 50% and 98% with 95% probability. Having no information on the number of false negative tweets resulted in a wider interval.</p>
                        <table-wrap position="float" id="table5">
                            <label>Table 5</label>
                            <caption>
                                <p>Prior and posterior means and 95% credible intervals when unretrieved messages cannot be archived.</p>
                            </caption>
                            <table width="687" border="1" cellpadding="7" cellspacing="0" rules="groups" frame="hsides">
                                <col width="10" />
                                <col width="119" />
                                <col width="70" />
                                <col width="172" />
                                <col width="70" />
                                <col width="160" />
                                <thead>
                                    <tr valign="top">
                                        <td rowspan="2" colspan="2">
                                            <break />
                                        </td>
                                        <td colspan="2">Beta prior distribution</td>
                                        <td colspan="2">Posterior distribution</td>
                                    </tr>
                                    <tr valign="top">
                                        <td>Mean</td>
                                        <td>95% HD<sup>a</sup>
                                        </td>
                                        <td>Mean</td>
                                        <td>95% HPD<sup>b</sup>
                                        </td>
                                    </tr>
                                </thead>
                                <tbody>
                                    <tr valign="top">
                                        <td colspan="2">Prevalence</td>
                                        <td>0.010</td>
                                        <td>1&#215;10<sup>&#8722;6</sup>-0.031</td>
                                        <td>0.028</td>
                                        <td>0.020-0.038</td>
                                    </tr>
                                    <tr valign="top">
                                        <td colspan="6">
                                            <bold>Search filter</bold>
                                        </td>
                                    </tr>
                                    <tr valign="top">
                                        <td>
                                            <break />
                                        </td>
                                        <td>Recall</td>
                                        <td>0.667</td>
                                        <td>0.340-0.954</td>
                                        <td>0.752</td>
                                        <td>0.505-0.979</td>
                                    </tr>
                                    <tr valign="top">
                                        <td>
                                            <break />
                                        </td>
                                        <td>Precision<sup>c</sup>
                                        </td>
                                        <td>&#8211;</td>
                                        <td>&#8211;</td>
                                        <td>0.955</td>
                                        <td>0.949-0.961</td>
                                    </tr>
                                    <tr valign="top">
                                        <td>
                                            <break />
                                        </td>
                                        <td>Specificity</td>
                                        <td>0.733</td>
                                        <td>0.474-0.962</td>
                                        <td>0.999</td>
                                        <td>0.999-0.999</td>
                                    </tr>
                                    <tr valign="top">
                                        <td>
                                            <break />
                                        </td>
                                        <td>F1 score<sup>c</sup>
                                        </td>
                                        <td>&#8211;</td>
                                        <td>&#8211;</td>
                                        <td>0.835</td>
                                        <td>0.663-0.968</td>
                                    </tr>
                                </tbody>
                            </table>
                            <table-wrap-foot>
                                <fn id="table5fn1">
                                    <p>
                                        <sup>a</sup>HD: highest density interval.</p>
                                </fn>
                                <fn id="table5fn2">
                                    <p>
                                        <sup>b</sup>HPD: highest posterior density interval. The HPD interval is shorter than the equal-tailed interval for skewed distributions (computed using R package BOA [<xref ref-type="bibr" rid="ref36">36</xref>]).</p>
                                </fn>
                                <fn id="table5fn3">
                                    <p>
                                        <sup>c</sup>Prior density functions of precision and F1 score are not specified but determined as a function of other parameters.</p>
                                </fn>
                            </table-wrap-foot>
                        </table-wrap>
                    </sec>
                </sec>
                <sec>
                    <title>Assuming Human Coding Is Subject to Error</title>
                    <sec>
                        <title>Human Coding Is a Silver Standard</title>
                        <p>We assumed that the coders could accurately evaluate irrelevant contents with 100% specificity although they might falsely determine relevant contents to be irrelevant (&#60;100% recall). When human coders make false negative errors, the method assuming the ideal conditions underestimates the retrieval precision of the search filter. The bias-corrected equation gave a precision of 95.7%, indicating that the precision determined assuming the two conditions was minimally biased.</p>
                    </sec>
                    <sec>
                        <title>Human Coding Is Not a Standard Classifier</title>
                        <p>Finally, we assumed that coders could falsely determine irrelevant contents to be relevant and vice versa (&#60;100% recall and &#60;100% specificity). Each cell of <xref ref-type="table" rid="table4">Table 4</xref> can be split into truly relevant and irrelevant tweets. Again, let y<sub>1</sub> be the count of relevant tweets among those retrieved by the search filter and human-coded as relevant; the count of irrelevant tweets is 128 &#8722; y<sub>1</sub>. The Gibbs sampler (see <xref ref-type="app" rid="app3">Multimedia Appendix 3</xref>) was run for 100,000 cycles, and the first 10,000 cycles were discarded as burn-in. The prior distribution and posterior inference results are presented in <xref ref-type="table" rid="table6">Table 6</xref>. Our belief that human coding is slightly better than the search filter is reflected in the prior distributions. The posterior mean of prevalence of e-cigarette tweets is 2% in the archive. The posterior mean of retrieval recall is 93% for the search filter and 96% for human coding. Having more information resulted in smaller uncertainty (ie, shorter HPD intervals).</p>
                        <table-wrap position="float" id="table6">
                            <label>Table 6</label>
                            <caption>
                                <p>Prior and posterior means and 95% credible intervals when human coding is not a standard classifier.</p>
                            </caption>
                            <table width="687" border="1" cellpadding="7" cellspacing="0" rules="groups" frame="hsides">
                                <col width="10" />
                                <col width="119" />
                                <col width="70" />
                                <col width="172" />
                                <col width="70" />
                                <col width="160" />
                                <thead>
                                    <tr valign="top">
                                        <td rowspan="2" colspan="2">
                                            <break />
                                        </td>
                                        <td colspan="2">Beta prior distribution</td>
                                        <td colspan="2">Posterior distribution</td>
                                    </tr>
                                    <tr valign="top">
                                        <td>Mean</td>
                                        <td>95% HD<sup>a</sup>
                                        </td>
                                        <td>Mean</td>
                                        <td>95% HPD<sup>b</sup>
                                        </td>
                                    </tr>
                                </thead>
                                <tbody>
                                    <tr valign="top">
                                        <td colspan="2">Prevalence</td>
                                        <td>0.019</td>
                                        <td>1&#215;10<sup>&#8722;6</sup>-0.031</td>
                                        <td>0.021</td>
                                        <td>0.018-0.025</td>
                                    </tr>
                                    <tr valign="top">
                                        <td colspan="6">
                                            <bold>Search filter</bold>
                                        </td>
                                    </tr>
                                    <tr valign="top">
                                        <td>
                                            <break />
                                        </td>
                                        <td>Recall</td>
                                        <td>0.667</td>
                                        <td>0.340-0.954</td>
                                        <td>0.929</td>
                                        <td>0.862-0.992</td>
                                    </tr>
                                    <tr valign="top">
                                        <td>
                                            <break />
                                        </td>
                                        <td>Precision<sup>c</sup>
                                        </td>
                                        <td>&#8211;</td>
                                        <td>&#8211;</td>
                                        <td>0.956</td>
                                        <td>0.914-0.994</td>
                                    </tr>
                                    <tr valign="top">
                                        <td>
                                            <break />
                                        </td>
                                        <td>Specificity</td>
                                        <td>0.733</td>
                                        <td>0.474-0.962</td>
                                        <td>0.999</td>
                                        <td>0.998-1.000</td>
                                    </tr>
                                    <tr valign="top">
                                        <td>
                                            <break />
                                        </td>
                                        <td>F1 score<sup>c</sup>
                                        </td>
                                        <td>&#8211;</td>
                                        <td>&#8211;</td>
                                        <td>0.942</td>
                                        <td>0.901-0.982</td>
                                    </tr>
                                    <tr valign="top">
                                        <td colspan="6">
                                            <bold>Human coding</bold>
                                        </td>
                                    </tr>
                                    <tr valign="top">
                                        <td>
                                            <break />
                                        </td>
                                        <td>Recall</td>
                                        <td>0.733</td>
                                        <td>0.474-0.962</td>
                                        <td>0.961</td>
                                        <td>0.923-0.995</td>
                                    </tr>
                                    <tr valign="top">
                                        <td>
                                            <break />
                                        </td>
                                        <td>Precision<sup>c</sup>
                                        </td>
                                        <td>&#8211;</td>
                                        <td>&#8211;</td>
                                        <td>0.897</td>
                                        <td>0.824-0.971</td>
                                    </tr>
                                    <tr valign="top">
                                        <td>
                                            <break />
                                        </td>
                                        <td>Specificity</td>
                                        <td>0.800</td>
                                        <td>0.616-0.975</td>
                                        <td>0.998</td>
                                        <td>0.996-0.999</td>
                                    </tr>
                                    <tr valign="top">
                                        <td>
                                            <break />
                                        </td>
                                        <td>F1 score<sup>c</sup>
                                        </td>
                                        <td>&#8211;</td>
                                        <td>&#8211;</td>
                                        <td>0.927</td>
                                        <td>0.883-0.971</td>
                                    </tr>
                                </tbody>
                            </table>
                            <table-wrap-foot>
                                <fn id="table6fn1">
                                    <p>
                                        <sup>a</sup>HD: highest density interval.</p>
                                </fn>
                                <fn id="table6fn2">
                                    <p>
                                        <sup>b</sup>HPD: highest posterior density interval. The HPD interval is shorter than the equal-tailed interval for skewed densities (computed using R package BOA [<xref ref-type="bibr" rid="ref36">36</xref>]).</p>
                                </fn>
                                <fn id="table6fn3">
                                    <p>
                                        <sup>c</sup>Prior density functions of precision and F1 score are not specified but implied as a function of other parameters.</p>
                                </fn>
                            </table-wrap-foot>
                        </table-wrap>
                    </sec>
                </sec>
            </sec>
        </sec>
        <sec sec-type="discussion">
            <title>Discussion</title>
            <sec>
                <title>Principal Findings</title>
                <p>While traditional survey data can take years to collect, social media data offer insights into health behavior and public sentiment around health-related topics in a much shorter time frame. They enable researchers to conduct qualitative studies previously only available via focus groups on a large scale. However, a large quantity of data does not assure valid and reliable results. In fact, biases may scale up with the quantity. For example, surveillance systems based on poor data may greatly overpredict or underpredict disease prevalence [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]. Without proper search filters, the quality of inferences from social media data will be at best poor, regardless of analytical techniques. Proper filtering and quality assessment are crucial for research with social media data.</p>
                <p>Building a search filter is rarely a one-step process, but rather requires significant effort [<xref ref-type="bibr" rid="ref22">22</xref>]. It is an iterative progression of refining search keywords and rules that capture relevant social data which satisfy pre-specified thresholds for precision and signal-to-noise ratio. We developed the e-cigarette search filter by monitoring frequency and precision for each keyword. The search filter was refined until no more important new terms were discovered. The keywords were combined with search rules to increase retrieval precision. Wang et al have proposed a method to automatically update the list of keywords by adding the most frequent terms that appear among relevant tweets [<xref ref-type="bibr" rid="ref28">28</xref>]. We are working toward semi-automating our iterative process by incorporating their method.</p>
                <p>We quantified search filter quality by computing retrieval precision and recall in four different cases. Retrieval precision was estimated above 95% in all cases. Retrieval recall was estimated at 86% assuming ideal conditions, 75% when unretrieved messages could not be archived, 86% assuming no false negative errors by coders, and 93% assuming that human coders make both false negative and false positive errors. Researchers should determine which condition is appropriate according to their expert knowledge and experience about the topics and search filters. Regardless of which approach is chosen, the rationale and approach should be clearly reported in any presentation of the data and analyses.</p>
                <p>The e-cigarette search filter (see <xref ref-type="app" rid="app4">Multimedia Appendix 4</xref>) was developed in 2012. Since that time, e-cigarette popularity has increased significantly [<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref40">40</xref>], many new brands and various types of vaping devices have entered the market, and e-cigarette-related language and slang terms have evolved. If we were to use the same search filter to study what people say about e-cigarettes on social media in 2015, the retrieval precision and recall would be poor. This underscores the importance of reporting the search filters used, along with their retrieval precision and recall at the time of data collection. When tracking trends of behaviors, attitudes, and beliefs over time, it is crucial to maintain an updated list of keywords/search filters for the given topic.</p>
            </sec>
            <sec>
                <title>Filtering Using Machine Classifiers</title>
                <p>Machine learning classifiers are often used for content analysis but also can be used to remove irrelevant messages from the data retrieved by search filters [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. A well-developed classifier can reduce human labor. The accuracy of the classifier should be validated on a hold-out sample by computing precision and recall of the classifier. We refer the validation of classifiers to machine learning literature [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref42">42</xref>].</p>
                <p>The retrieval precision may be approximated by the classifier precision, but the estimation of retrieval recall can be different from the classifier recall. Classifier recall measures the model&#8217;s ability to correctly identify relevant content among the data retrieved by the search filters, whereas retrieval recall estimates how completely relevant content is captured by the search filters, relative to the universe of possible content (all Twitter messages in our example). The estimation of retrieval recall, therefore, is inherently theoretical because it is arduous and resource-intensive to sample unretrieved messages. In practice, its estimation involves examining unretrieved data from as many sources/repositories as possible. Our team collects and manages Twitter data in multiple archives to cover a broad range of topics related to tobacco products and associated behaviors; thus, we could sample from these other archives to see if they captured any content that is potentially relevant to e-cigarettes. Others may archive the Streaming API of Twitter or design another sampling strategy. The important point is to approximate as best as possible the universe in which relevant content may appear.</p>
            </sec>
            <sec>
                <title>Future Research</title>
                <p>In addition to data collection and quality assessment, it is important to report data sources, which can affect the validity of inference. Public data on Twitter can be accessed by Firehose, Search API, or Streaming API. The latter two have rate limits, which may prevent retrieval of full data depending on the volume of topics. A small random sample of the full stream may contain abundant information about popular topics, for example, a movie star. Some topics may be so scarce in the Twitterverse that rate limits may not be an issue, but sudden spikes in tweet volume induced by, for example, a policy change may not be captured due to rate limits. Further research is needed to investigate how the inference is affected by data sources and to provide guidelines. Regardless of data sources, in order to evaluate and compare results across studies, it is critical for researchers using social media data to clearly report how their data were collected and what assumptions were made about unretrieved data, and to provide estimates of the quality of their retrieved data. While strategies may vary by research topic and/or data availability, transparent and thorough reporting is crucial for the credibility of studies as well as the establishment of a rigorous standard for social media research.</p>
            </sec>
            <sec>
                <title>Limitations</title>
                <p>Our methods have certain limitations. We constructed an archive to store tweets potentially related to tobacco smoking. Such an archive is not a random sample of the Twitterverse and thus induces selection bias; it may leave out a small fraction of relevant tweets (&#8220;e&#8221; in <xref ref-type="fig" rid="figure1">Figure 1</xref>). This selection bias affects the recall estimate via P(retr) and P(relevant&#124;unretr) in Equation (2). First, if the Twitterverse were used instead of the archive, P(retr) would be much smaller than 0.0208 due to a much larger denominator. This implies that the retrieval recall should be lower. On the other hand, the archive has a high chance of containing e-cigarette messages. That is, it is more likely to contain false negative contents than a random sample of the Twitterverse. Accordingly, if the Twitterverse were used, P(relevant&#124;unretr) should be lower and is likely to have many leading zeros. This implies that the retrieval recall should be higher. The two components affect the recall estimate in opposite directions. Although the archive has selection bias, it helps find false negative contents and refine the search filter. In addition, the ratio of retrieved to unretrieved messages is larger in the archive than in the Twitterverse. Validating the search filter quality when this ratio is about 1/800 or smaller requires coders to evaluate an impractically huge number of tweets for reliable recall estimation (see Case 2 in <xref ref-type="app" rid="app1">Multimedia Appendix 1</xref>).</p>
            </sec>
            <sec>
                <title>Call for Rigorous Research</title>
                <p>The number of studies that rely on social media data is increasing [<xref ref-type="bibr" rid="ref43">43</xref>]. However, few have thoroughly described the search filter building process or fully assessed data quality. In order to assess data collection and quality, research involving social media data should clearly describe data sources, including how data were accessed and collected and how search filters were built, as well as presenting retrieval precision and recall. Data with low recall will poorly represent the target topic, and data with low precision will give misleading information. In light of moving toward a reporting standard, we propose a checklist (see <xref ref-type="boxed-text" rid="box1">Textbox 1</xref>) for reporting social media data preparation. Study findings should be replicable and comparable with clearly described data and methods.</p>
                <boxed-text id="box1" position="float">
                    <title>Checklist for social media data preparation and reporting.</title>
                    <p>1. Data source</p>
                    <list list-type="bullet">
                        <list-item>
                            <p>Social networking site and time frame</p>
                        </list-item>
                        <list-item>
                            <p>How the data are accessed (eg, Streaming API)</p>
                        </list-item>
                        <list-item>
                            <p>Why the data source is suitable for the research topics, and any limitations of the data source</p>
                        </list-item>
                    </list>
                    <p>2. Development of search filter</p>
                    <list list-type="bullet">
                        <list-item>
                            <p>How candidate keywords are generated</p>
                        </list-item>
                        <list-item>
                            <p>How keywords are refined</p>
                        </list-item>
                        <list-item>
                            <p>Complete list of final keywords and search rules</p>
                        </list-item>
                    </list>
                    <p>3. Assessment of search filter</p>
                    <list list-type="bullet">
                        <list-item>
                            <p>Assumptions about human coding</p>
                        </list-item>
                        <list-item>
                            <p>Sampling frame and sample size for human coding</p>
                        </list-item>
                        <list-item>
                            <p>Whether all necessary data are available to assess the search filter</p>
                        </list-item>
                        <list-item>
                            <p>Whether and how retrieval precision and recall are estimated</p>
                        </list-item>
                    </list>
                </boxed-text>
            </sec>
            <sec>
                <title>Conclusions</title>
                <p>In this paper, we proposed a framework for social media data collection and validation and discussed how to quantify data quality under different conditions. Our proposed methodology is not limited to Twitter and can be adapted to other public social networking sites (as opposed to online forums or closed online networks). The length limit of posts, different data fields (title, description, tag, comment, etc), main user characteristics, data streaming, or crawling tools may be considered for modification. Our method is primarily useful for text-based social data, but it can be adapted to image-based social media. Instagram users, for instance, post photos with hashtags; we can retrieve potentially relevant contents based on hashtags [<xref ref-type="bibr" rid="ref44">44</xref>] and remove irrelevant contents by using an image classifier. We hope our proposed framework and methods contribute to more rigorous and transparent health research using social media data.</p>
            </sec>
        </sec>
    </body>
    <back>
        <app-group>
            <app id="app1">
                <title>Multimedia Appendix 1</title>
                <p>The sample size simulation for human coding.</p>
                <media xlink:href="jmir_v18i2e41_app1.pdf" xlink:title="PDF File (Adobe PDF File), 324KB" />
            </app>
            <app id="app2">
                <title>Multimedia Appendix 2</title>
                <p>Precision and recall estimation using only retrieved data.</p>
                <media xlink:href="jmir_v18i2e41_app2.pdf" xlink:title="PDF File (Adobe PDF File), 360KB" />
            </app>
            <app id="app3">
                <title>Multimedia Appendix 3</title>
                <p>Precision and recall estimation when human coding is not a standard classifier.</p>
                <media xlink:href="jmir_v18i2e41_app3.pdf" xlink:title="PDF File (Adobe PDF File), 373KB" />
            </app>
            <app id="app4">
                <title>Multimedia Appendix 4</title>
                <p>E-cigarette search keywords and rules.</p>
                <media xlink:href="jmir_v18i2e41_app4.pdf" xlink:title="PDF File (Adobe PDF File), 179KB" />
            </app>
        </app-group>
        <glossary>
            <title>Abbreviations</title>
            <def-list>
                <def-item>
                    <term id="abb1">API</term>
                    <def>
                        <p>application programming interface</p>
                    </def>
                </def-item>
                <def-item>
                    <term id="abb2">FDA</term>
                    <def>
                        <p>Food and Drug Administration</p>
                    </def>
                </def-item>
                <def-item>
                    <term id="abb3">HD</term>
                    <def>
                        <p>highest density</p>
                    </def>
                </def-item>
                <def-item>
                    <term id="abb4">HPD</term>
                    <def>
                        <p>highest posterior density</p>
                    </def>
                </def-item>
                <def-item>
                    <term id="abb5">NIH</term>
                    <def>
                        <p>National Institutes of Health</p>
                    </def>
                </def-item>
                <def-item>
                    <term id="abb6">NPV</term>
                    <def>
                        <p>negative predictive value</p>
                    </def>
                </def-item>
            </def-list>
        </glossary>
        <ack>
            <p>We would like to thank the members of the Health Media Collaboratory who helped collect and code massive Twitter data (Rachel Kornfield, Steven Binns, Lisa E Vera, Kristen Emory, Glen Szczypka, Eman H Aly, Hy Tran), and Lisa E Vera for editing the text. This paper would not have been possible without such teamwork.</p>
            <p>This study was supported by the NIH National Cancer Institute under Award Number U01CA154254 and FDA Center for Tobacco Products under Award Number P50CA179546. The content is solely the responsibility of the authors and does not necessarily represent the official views of the NIH or the FDA.</p>
        </ack>
        <fn-group>
            <fn fn-type="con">
                <p>All authors contributed to conceptualization, coding the data, and writing the text.</p>
            </fn>
            <fn fn-type="conflict">
                <p>None declared.</p>
            </fn>
        </fn-group>
        <ref-list>
            <ref id="ref1">
                <label>1</label>
                <nlm-citation citation-type="web">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Fox</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Duggan</surname>
                            <given-names>M</given-names>
                        </name>
                    </person-group>
                    <source>Internet &#38; American Life Project</source>
                    <year>2013</year>
                    <month>01</month>
                    <access-date>2015-01-10</access-date>
                    <publisher-name>Pew Research Center</publisher-name>
                    <comment>Health online 2013<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.pewinternet.org/2013/01/15/health-online-2013/">http://www.pewinternet.org/2013/01/15/health-online-2013/</ext-link>
                        <ext-link ext-link-type="webcite" xlink:href="6YeJ27UsV" />
                    </comment>
                </nlm-citation>
            </ref>
            <ref id="ref2">
                <label>2</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Koch-Weser</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Bradshaw</surname>
                            <given-names>YS</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Gualtieri</surname>
                            <given-names>L</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Gallagher</surname>
                            <given-names>SS</given-names>
                        </name>
                    </person-group>
                    <article-title>The Internet as a health information source: findings from the 2007 Health Information National Trends Survey and implications for health communication</article-title>
                    <source>J Health Commun</source>
                    <year>2010</year>
                    <volume>15 Suppl 3</volume>
                    <fpage>279</fpage>
                    <lpage>293</lpage>
                    <pub-id pub-id-type="doi">10.1080/10810730.2010.522700</pub-id>
                    <pub-id pub-id-type="medline">21154099</pub-id>
                    <pub-id pub-id-type="pii">930956751</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref3">
                <label>3</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>McCully</surname>
                            <given-names>SN</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Don</surname>
                            <given-names>BP</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Updegraff</surname>
                            <given-names>JA</given-names>
                        </name>
                    </person-group>
                    <article-title>Using the Internet to help with diet, weight, and physical activity: results from the Health Information National Trends Survey (HINTS)</article-title>
                    <source>J Med Internet Res</source>
                    <year>2013</year>
                    <volume>15</volume>
                    <issue>8</issue>
                    <fpage>e148</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2013/8/e148/" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/jmir.2612</pub-id>
                    <pub-id pub-id-type="medline">23906945</pub-id>
                    <pub-id pub-id-type="pii">v15i8e148</pub-id>
                    <pub-id pub-id-type="pmcid">PMC3742401</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref4">
                <label>4</label>
                <nlm-citation citation-type="web">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Anderson</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Caumont</surname>
                            <given-names>A</given-names>
                        </name>
                    </person-group>
                    <source>Fact Tank</source>
                    <year>2014</year>
                    <month>09</month>
                    <day>24</day>
                    <access-date>2015-05-18</access-date>
                    <publisher-name>Pew Research Center</publisher-name>
                    <comment>How social media is reshaping news<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.pewresearch.org/fact-tank/2014/09/24/how-social-media-is-reshaping-news/">http://www.pewresearch.org/fact-tank/2014/09/24/how-social-media-is-reshaping-news/</ext-link>
                        <ext-link ext-link-type="webcite" xlink:href="6YeJY7Ho2" />
                    </comment>
                </nlm-citation>
            </ref>
            <ref id="ref5">
                <label>5</label>
                <nlm-citation citation-type="web">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Somaiya</surname>
                            <given-names>R</given-names>
                        </name>
                    </person-group>
                    <source>The New York Times</source>
                    <year>2014</year>
                    <month>10</month>
                    <day>26</day>
                    <access-date>2015-05-19</access-date>
                    <comment>How Facebook is changing the way its users consume journalism<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.nytimes.com/2014/10/27/business/media/how-facebook-is-changing-the-way-its-users-consume-journalism.html">http://www.nytimes.com/2014/10/27/business/media/how-facebook-is-changing-the-way-its-users-consume-journalism.html</ext-link>
                        <ext-link ext-link-type="webcite" xlink:href="6YeJtn0H2" />
                    </comment>
                </nlm-citation>
            </ref>
            <ref id="ref6">
                <label>6</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Eysenbach</surname>
                            <given-names>G</given-names>
                        </name>
                    </person-group>
                    <article-title>Infodemiology and infoveillance: framework for an emerging set of public health informatics methods to analyze search, communication and publication behavior on the Internet</article-title>
                    <source>J Med Internet Res</source>
                    <year>2009</year>
                    <volume>11</volume>
                    <issue>1</issue>
                    <fpage>e11</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2009/1/e11/" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/jmir.1157</pub-id>
                    <pub-id pub-id-type="medline">19329408</pub-id>
                    <pub-id pub-id-type="pii">v11i1e11</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2762766</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref7">
                <label>7</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Chew</surname>
                            <given-names>C</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Eysenbach</surname>
                            <given-names>G</given-names>
                        </name>
                    </person-group>
                    <article-title>Pandemics in the age of Twitter: content analysis of Tweets during the 2009 H1N1 outbreak</article-title>
                    <source>PLoS One</source>
                    <year>2010</year>
                    <volume>5</volume>
                    <issue>11</issue>
                    <fpage>e14118</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0014118" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1371/journal.pone.0014118</pub-id>
                    <pub-id pub-id-type="medline">21124761</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2993925</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref8">
                <label>8</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Signorini</surname>
                            <given-names>A</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Segre</surname>
                            <given-names>AM</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Polgreen</surname>
                            <given-names>PM</given-names>
                        </name>
                    </person-group>
                    <article-title>The use of Twitter to track levels of disease activity and public concern in the U.S. during the influenza A H1N1 pandemic</article-title>
                    <source>PLoS One</source>
                    <year>2011</year>
                    <volume>6</volume>
                    <issue>5</issue>
                    <fpage>e19467</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0019467" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1371/journal.pone.0019467</pub-id>
                    <pub-id pub-id-type="medline">21573238</pub-id>
                    <pub-id pub-id-type="pii">PONE-D-10-02464</pub-id>
                    <pub-id pub-id-type="pmcid">PMC3087759</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref9">
                <label>9</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Mysl&#237;n</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Zhu</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Chapman</surname>
                            <given-names>W</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Conway</surname>
                            <given-names>M</given-names>
                        </name>
                    </person-group>
                    <article-title>Using Twitter to examine smoking behavior and perceptions of emerging tobacco products</article-title>
                    <source>J Med Internet Res</source>
                    <year>2013</year>
                    <volume>15</volume>
                    <issue>8</issue>
                    <fpage>e174</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2013/8/e174/" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/jmir.2534</pub-id>
                    <pub-id pub-id-type="medline">23989137</pub-id>
                    <pub-id pub-id-type="pii">v15i8e174</pub-id>
                    <pub-id pub-id-type="pmcid">PMC3758063</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref10">
                <label>10</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Heaivilin</surname>
                            <given-names>N</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Gerbert</surname>
                            <given-names>B</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Page</surname>
                            <given-names>JE</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Gibbs</surname>
                            <given-names>JL</given-names>
                        </name>
                    </person-group>
                    <article-title>Public health surveillance of dental pain via Twitter</article-title>
                    <source>J Dent Res</source>
                    <year>2011</year>
                    <month>09</month>
                    <volume>90</volume>
                    <issue>9</issue>
                    <fpage>1047</fpage>
                    <lpage>1051</lpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/21768306" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1177/0022034511415273</pub-id>
                    <pub-id pub-id-type="medline">21768306</pub-id>
                    <pub-id pub-id-type="pii">0022034511415273</pub-id>
                    <pub-id pub-id-type="pmcid">PMC3169887</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref11">
                <label>11</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Kim</surname>
                            <given-names>AE</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Hopper</surname>
                            <given-names>T</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Simpson</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Nonnemaker</surname>
                            <given-names>J</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Lieberman</surname>
                            <given-names>AJ</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Hansen</surname>
                            <given-names>H</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Guillory</surname>
                            <given-names>J</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Porter</surname>
                            <given-names>L</given-names>
                        </name>
                    </person-group>
                    <article-title>Using Twitter Data to Gain Insights into E-cigarette Marketing and Locations of Use: An Infoveillance Study</article-title>
                    <source>J Med Internet Res</source>
                    <year>2015</year>
                    <volume>17</volume>
                    <issue>11</issue>
                    <fpage>e251</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2015/11/e251/" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/jmir.4466</pub-id>
                    <pub-id pub-id-type="medline">26545927</pub-id>
                    <pub-id pub-id-type="pii">v17i11e251</pub-id>
                    <pub-id pub-id-type="pmcid">PMC4642798</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref12">
                <label>12</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Sanders-Jackson</surname>
                            <given-names>A</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Brown</surname>
                            <given-names>CG</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Prochaska</surname>
                            <given-names>JJ</given-names>
                        </name>
                    </person-group>
                    <article-title>Applying linguistic methods to understanding smoking-related conversations on Twitter</article-title>
                    <source>Tob Control</source>
                    <year>2015</year>
                    <month>03</month>
                    <volume>24</volume>
                    <issue>2</issue>
                    <fpage>136</fpage>
                    <lpage>138</lpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/24227540" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1136/tobaccocontrol-2013-051243</pub-id>
                    <pub-id pub-id-type="medline">24227540</pub-id>
                    <pub-id pub-id-type="pii">tobaccocontrol-2013-051243</pub-id>
                    <pub-id pub-id-type="pmcid">PMC4103964</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref13">
                <label>13</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Zhang</surname>
                            <given-names>N</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Campo</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Janz</surname>
                            <given-names>KF</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Eckler</surname>
                            <given-names>P</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Yang</surname>
                            <given-names>J</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Snetselaar</surname>
                            <given-names>LG</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Signorini</surname>
                            <given-names>A</given-names>
                        </name>
                    </person-group>
                    <article-title>Electronic word of mouth on Twitter about physical activity in the United States: exploratory infodemiology study</article-title>
                    <source>J Med Internet Res</source>
                    <year>2013</year>
                    <volume>15</volume>
                    <issue>11</issue>
                    <fpage>e261</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2013/11/e261/" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/jmir.2870</pub-id>
                    <pub-id pub-id-type="medline">24257325</pub-id>
                    <pub-id pub-id-type="pii">v15i11e261</pub-id>
                    <pub-id pub-id-type="pmcid">PMC3841353</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref14">
                <label>14</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Collier</surname>
                            <given-names>N</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Son</surname>
                            <given-names>NT</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Nguyen</surname>
                            <given-names>NM</given-names>
                        </name>
                    </person-group>
                    <article-title>OMG U got flu? Analysis of shared health messages for bio-surveillance</article-title>
                    <source>J Biomed Semantics</source>
                    <year>2011</year>
                    <volume>2 Suppl 5</volume>
                    <fpage>S9</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://jbiomedsem.biomedcentral.com/articles/10.1186/2041-1480-2-S5-S9" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1186/2041-1480-2-S5-S9</pub-id>
                    <pub-id pub-id-type="medline">22166368</pub-id>
                    <pub-id pub-id-type="pii">2041-1480-2-S5-S9</pub-id>
                    <pub-id pub-id-type="pmcid">PMC3239309</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref15">
                <label>15</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Cole-Lewis</surname>
                            <given-names>H</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Pugatch</surname>
                            <given-names>J</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Sanders</surname>
                            <given-names>A</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Varghese</surname>
                            <given-names>A</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Posada</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Yun</surname>
                            <given-names>C</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Schwarz</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Augustson</surname>
                            <given-names>E</given-names>
                        </name>
                    </person-group>
                    <article-title>Social Listening: A Content Analysis of E-Cigarette Discussions on Twitter</article-title>
                    <source>J Med Internet Res</source>
                    <year>2015</year>
                    <volume>17</volume>
                    <issue>10</issue>
                    <fpage>e243</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2015/10/e243/" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/jmir.4969</pub-id>
                    <pub-id pub-id-type="medline">26508089</pub-id>
                    <pub-id pub-id-type="pii">v17i10e243</pub-id>
                    <pub-id pub-id-type="pmcid">PMC4642379</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref16">
                <label>16</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Chan</surname>
                            <given-names>B</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Lopez</surname>
                            <given-names>A</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Sarkar</surname>
                            <given-names>U</given-names>
                        </name>
                    </person-group>
                    <article-title>The Canary in the Coal Mine Tweets: Social Media Reveals Public Perceptions of Non-Medical Use of Opioids</article-title>
                    <source>PLoS One</source>
                    <year>2015</year>
                    <volume>10</volume>
                    <issue>8</issue>
                    <fpage>e0135072</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0135072" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1371/journal.pone.0135072</pub-id>
                    <pub-id pub-id-type="medline">26252774</pub-id>
                    <pub-id pub-id-type="pii">PONE-D-14-56862</pub-id>
                    <pub-id pub-id-type="pmcid">PMC4529203</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref17">
                <label>17</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Eichstaedt</surname>
                            <given-names>JC</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Schwartz</surname>
                            <given-names>HA</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Kern</surname>
                            <given-names>ML</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Park</surname>
                            <given-names>G</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Labarthe</surname>
                            <given-names>DR</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Merchant</surname>
                            <given-names>RM</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Jha</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Agrawal</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Dziurzynski</surname>
                            <given-names>LA</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Sap</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Weeg</surname>
                            <given-names>C</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Larson</surname>
                            <given-names>EE</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Ungar</surname>
                            <given-names>LH</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Seligman</surname>
                            <given-names>MEP</given-names>
                        </name>
                    </person-group>
                    <article-title>Psychological language on Twitter predicts county-level heart disease mortality</article-title>
                    <source>Psychol Sci</source>
                    <year>2015</year>
                    <month>02</month>
                    <volume>26</volume>
                    <issue>2</issue>
                    <fpage>159</fpage>
                    <lpage>169</lpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/25605707" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1177/0956797614557867</pub-id>
                    <pub-id pub-id-type="medline">25605707</pub-id>
                    <pub-id pub-id-type="pii">0956797614557867</pub-id>
                    <pub-id pub-id-type="pmcid">PMC4433545</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref18">
                <label>18</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Gittelman</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Lange</surname>
                            <given-names>V</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Gotway Crawford</surname>
                            <given-names>CA</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Okoro</surname>
                            <given-names>CA</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Lieb</surname>
                            <given-names>E</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Dhingra</surname>
                            <given-names>SS</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Trimarchi</surname>
                            <given-names>E</given-names>
                        </name>
                    </person-group>
                    <article-title>A new source of data for public health surveillance: Facebook likes</article-title>
                    <source>J Med Internet Res</source>
                    <year>2015</year>
                    <volume>17</volume>
                    <issue>4</issue>
                    <fpage>e98</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2015/4/e98/" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/jmir.3970</pub-id>
                    <pub-id pub-id-type="medline">25895907</pub-id>
                    <pub-id pub-id-type="pii">v17i4e98</pub-id>
                    <pub-id pub-id-type="pmcid">PMC4419195</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref19">
                <label>19</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Kietzmann</surname>
                            <given-names>JH</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Hermkens</surname>
                            <given-names>K</given-names>
                        </name>
                        <name name-style="western">
                            <surname>McCarthy</surname>
                            <given-names>IP</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Silvestre</surname>
                            <given-names>BS</given-names>
                        </name>
                    </person-group>
                    <article-title>Social media? Get serious! Understanding the functional building blocks of social media</article-title>
                    <source>Business Horizons</source>
                    <year>2011</year>
                    <month>05</month>
                    <volume>54</volume>
                    <issue>3</issue>
                    <fpage>241</fpage>
                    <lpage>251</lpage>
                    <pub-id pub-id-type="doi">10.1016/j.bushor.2011.01.005</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref20">
                <label>20</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Petrova</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Sutcliffe</surname>
                            <given-names>P</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Fulford</surname>
                            <given-names>KWM</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Dale</surname>
                            <given-names>J</given-names>
                        </name>
                    </person-group>
                    <article-title>Search terms and a validated brief search filter to retrieve publications on health-related values in Medline: a word frequency analysis study</article-title>
                    <source>J Am Med Inform Assoc</source>
                    <year>2012</year>
                    <volume>19</volume>
                    <issue>3</issue>
                    <fpage>479</fpage>
                    <lpage>488</lpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://jamia.oxfordjournals.org/cgi/pmidlookup?view=long&#38;pmid=21846778" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1136/amiajnl-2011-000243</pub-id>
                    <pub-id pub-id-type="medline">21846778</pub-id>
                    <pub-id pub-id-type="pii">amiajnl-2011-000243</pub-id>
                    <pub-id pub-id-type="pmcid">PMC3341778</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref21">
                <label>21</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Mollema</surname>
                            <given-names>L</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Harmsen</surname>
                            <given-names>IA</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Broekhuizen</surname>
                            <given-names>E</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Clijnk</surname>
                            <given-names>R</given-names>
                        </name>
                        <name name-style="western">
                            <surname>De Melker</surname>
                            <given-names>H</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Paulussen</surname>
                            <given-names>T</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Kok</surname>
                            <given-names>G</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Ruiter</surname>
                            <given-names>R</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Das</surname>
                            <given-names>E</given-names>
                        </name>
                    </person-group>
                    <article-title>Disease detection or public opinion reflection? Content analysis of tweets, other social media, and online newspapers during the measles outbreak in The Netherlands in 2013</article-title>
                    <source>J Med Internet Res</source>
                    <year>2015</year>
                    <volume>17</volume>
                    <issue>5</issue>
                    <fpage>e128</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2015/5/e128/" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/jmir.3863</pub-id>
                    <pub-id pub-id-type="medline">26013683</pub-id>
                    <pub-id pub-id-type="pii">v17i5e128</pub-id>
                    <pub-id pub-id-type="pmcid">PMC4468573</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref22">
                <label>22</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Adrover</surname>
                            <given-names>C</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Bodnar</surname>
                            <given-names>T</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Huang</surname>
                            <given-names>Z</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Telenti</surname>
                            <given-names>A</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Salath&#233;</surname>
                            <given-names>M</given-names>
                        </name>
                    </person-group>
                    <article-title>Identifying Adverse Effects of HIV Drug Treatment and Associated Sentiments Using Twitter</article-title>
                    <source>JMIR Public Health Surveill</source>
                    <year>2015</year>
                    <month>07</month>
                    <day>27</day>
                    <volume>1</volume>
                    <issue>2</issue>
                    <fpage>e7</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://doi.org/10.2196/publichealth.4488" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/publichealth.4488</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref23">
                <label>23</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Huang</surname>
                            <given-names>J</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Kornfield</surname>
                            <given-names>R</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Szczypka</surname>
                            <given-names>G</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Emery</surname>
                            <given-names>SL</given-names>
                        </name>
                    </person-group>
                    <article-title>A cross-sectional examination of marketing of electronic cigarettes on Twitter</article-title>
                    <source>Tob Control</source>
                    <year>2014</year>
                    <month>07</month>
                    <volume>23 Suppl 3</volume>
                    <fpage>iii26</fpage>
                    <lpage>30</lpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://tobaccocontrol.bmj.com/cgi/pmidlookup?view=long&#38;pmid=24935894" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1136/tobaccocontrol-2014-051551</pub-id>
                    <pub-id pub-id-type="medline">24935894</pub-id>
                    <pub-id pub-id-type="pii">tobaccocontrol-2014-051551</pub-id>
                    <pub-id pub-id-type="pmcid">PMC4078681</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref24">
                <label>24</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Ghosh</surname>
                            <given-names>DD</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Guha</surname>
                            <given-names>R</given-names>
                        </name>
                    </person-group>
                    <article-title>What are we 'tweeting' about obesity? Mapping tweets with Topic Modeling and Geographic Information System</article-title>
                    <source>Cartogr Geogr Inf Sci</source>
                    <year>2013</year>
                    <volume>40</volume>
                    <issue>2</issue>
                    <fpage>90</fpage>
                    <lpage>102</lpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/25126022" />
                    </comment>
                    <pub-id pub-id-type="doi">10.1080/15230406.2013.776210</pub-id>
                    <pub-id pub-id-type="medline">25126022</pub-id>
                    <pub-id pub-id-type="pmcid">PMC4128420</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref25">
                <label>25</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Stryker</surname>
                            <given-names>JE</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Wray</surname>
                            <given-names>RJ</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Hornik</surname>
                            <given-names>RC</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Yanovitzky</surname>
                            <given-names>I</given-names>
                        </name>
                    </person-group>
                    <article-title>Validation of Database Search Terms for Content Analysis: The Case of Cancer News Coverage</article-title>
                    <source>Journalism &#38; Mass Communication Quarterly</source>
                    <year>2006</year>
                    <month>06</month>
                    <day>01</day>
                    <volume>83</volume>
                    <issue>2</issue>
                    <fpage>413</fpage>
                    <lpage>430</lpage>
                    <pub-id pub-id-type="doi">10.1177/107769900608300212</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref26">
                <label>26</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>White</surname>
                            <given-names>VJ</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Glanville</surname>
                            <given-names>JM</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Lefebvre</surname>
                            <given-names>C</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Sheldon</surname>
                            <given-names>TA</given-names>
                        </name>
                    </person-group>
                    <article-title>A statistical approach to designing search filters to find systematic reviews: objectivity enhances accuracy</article-title>
                    <source>Journal of Information Science</source>
                    <year>2001</year>
                    <month>12</month>
                    <day>01</day>
                    <volume>27</volume>
                    <issue>6</issue>
                    <fpage>357</fpage>
                    <lpage>370</lpage>
                    <pub-id pub-id-type="doi">10.1177/016555150102700601</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref27">
                <label>27</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Staquet</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Rozencweig</surname>
                            <given-names>M</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Lee</surname>
                            <given-names>YJ</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Muggia</surname>
                            <given-names>FM</given-names>
                        </name>
                    </person-group>
                    <article-title>Methodology for the assessment of new dichotomous diagnostic tests</article-title>
                    <source>J Chronic Dis</source>
                    <year>1981</year>
                    <volume>34</volume>
                    <issue>12</issue>
                    <fpage>599</fpage>
                    <lpage>610</lpage>
                    <pub-id pub-id-type="medline">6458624</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref28">
                <label>28</label>
                <nlm-citation citation-type="confproc">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Wang</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Chen</surname>
                            <given-names>Z</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Liu</surname>
                            <given-names>B</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Emery</surname>
                            <given-names>S</given-names>
                        </name>
                    </person-group>
                    <article-title>Identifying Keywords for Searching Relevant Social Media Posts</article-title>
                    <year>2016</year>
                    <month>02</month>
                    <day>12</day>
                    <conf-name>30th AAAI Conference on Artificial Intelligence</conf-name>
                    <conf-date>Feb. 2016</conf-date>
                    <conf-loc>Phoenix, Arizona</conf-loc>
                    <publisher-name>AAAI-16</publisher-name>
                </nlm-citation>
            </ref>
            <ref id="ref29">
                <label>29</label>
                <nlm-citation citation-type="web">
                    <source>Urban Dictionary</source>
                    <access-date>2015-05-19</access-date>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.urbandictionary.com/">http://www.urbandictionary.com/</ext-link>
                        <ext-link ext-link-type="webcite" xlink:href="6YhNIyRcN" />
                    </comment>
                </nlm-citation>
            </ref>
            <ref id="ref30">
                <label>30</label>
                <nlm-citation citation-type="book">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Gordis</surname>
                            <given-names>L</given-names>
                        </name>
                    </person-group>
                    <source>Epidemiology</source>
                    <year>2004</year>
                    <publisher-loc>Philadelphia</publisher-loc>
                    <publisher-name>Elsevier Saunders</publisher-name>
                </nlm-citation>
            </ref>
            <ref id="ref31">
                <label>31</label>
                <nlm-citation citation-type="book">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Liu</surname>
                            <given-names>B</given-names>
                        </name>
                    </person-group>
                    <source>Web Data Mining: Exploring Hyperlinks, Contents, and Usage Data (Data-Centric Systems and Applications)</source>
                    <year>2011</year>
                    <publisher-loc>Heidelberg</publisher-loc>
                    <publisher-name>Springer</publisher-name>
                </nlm-citation>
            </ref>
            <ref id="ref32">
                <label>32</label>
                <nlm-citation citation-type="book">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Gelman</surname>
                            <given-names>A</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Carlin</surname>
                            <given-names>J</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Stern</surname>
                            <given-names>H</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Rubin</surname>
                            <given-names>D</given-names>
                        </name>
                    </person-group>
                    <source>Bayesian Data Analysis, Second Edition</source>
                    <year>2003</year>
                    <publisher-loc>Boca Raton, FL</publisher-loc>
                    <publisher-name>Chapman &#38; Hall</publisher-name>
                </nlm-citation>
            </ref>
            <ref id="ref33">
                <label>33</label>
                <nlm-citation citation-type="book">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Agresti</surname>
                            <given-names>A</given-names>
                        </name>
                    </person-group>
                    <source>Categorical data analysis</source>
                    <year>2002</year>
                    <publisher-loc>New York</publisher-loc>
                    <publisher-name>Wiley-Interscience</publisher-name>
                </nlm-citation>
            </ref>
            <ref id="ref34">
                <label>34</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Joseph</surname>
                            <given-names>L</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Gyorkos</surname>
                            <given-names>TW</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Coupal</surname>
                            <given-names>L</given-names>
                        </name>
                    </person-group>
                    <article-title>Bayesian estimation of disease prevalence and the parameters of diagnostic tests in the absence of a gold standard</article-title>
                    <source>Am J Epidemiol</source>
                    <year>1995</year>
                    <month>02</month>
                    <day>01</day>
                    <volume>141</volume>
                    <issue>3</issue>
                    <fpage>263</fpage>
                    <lpage>272</lpage>
                    <pub-id pub-id-type="medline">7840100</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref35">
                <label>35</label>
                <nlm-citation citation-type="web">
                    <source>Twitter Search</source>
                    <access-date>2016-02-03</access-date>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="https://twitter.com/search-home">https://twitter.com/search-home</ext-link>
                        <ext-link ext-link-type="webcite" xlink:href="6f24ZCZqD" />
                    </comment>
                </nlm-citation>
            </ref>
            <ref id="ref36">
                <label>36</label>
                <nlm-citation citation-type="web">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Smith</surname>
                            <given-names>B</given-names>
                        </name>
                    </person-group>
                    <source>Bayesian output analysis program (BOA) Version 1.1 Users Manual</source>
                    <year>2005</year>
                    <month>01</month>
                    <day>08</day>
                    <access-date>2016-02-03</access-date>
                    <publisher-loc>Iowa City, IA</publisher-loc>
                    <publisher-name>University of Iowa</publisher-name>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.public-health.uiowa.edu/boa/boa.pdf">http://www.public-health.uiowa.edu/boa/boa.pdf</ext-link>
                        <ext-link ext-link-type="webcite" xlink:href="6f24iO3OH" />
                    </comment>
                </nlm-citation>
            </ref>
            <ref id="ref37">
                <label>37</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Fung</surname>
                            <given-names>IC</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Tse</surname>
                            <given-names>ZTH</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Cheung</surname>
                            <given-names>C</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Miu</surname>
                            <given-names>AS</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Fu</surname>
                            <given-names>K</given-names>
                        </name>
                    </person-group>
                    <article-title>Ebola and the social media</article-title>
                    <source>Lancet</source>
                    <year>2014</year>
                    <month>12</month>
                    <day>20</day>
                    <volume>384</volume>
                    <issue>9961</issue>
                    <fpage>2207</fpage>
                    <pub-id pub-id-type="doi">10.1016/S0140-6736(14)62418-1</pub-id>
                    <pub-id pub-id-type="medline">25625391</pub-id>
                    <pub-id pub-id-type="pii">S0140-6736(14)62418-1</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref38">
                <label>38</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Lazer</surname>
                            <given-names>D</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Kennedy</surname>
                            <given-names>R</given-names>
                        </name>
                        <name name-style="western">
                            <surname>King</surname>
                            <given-names>G</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Vespignani</surname>
                            <given-names>A</given-names>
                        </name>
                    </person-group>
                    <article-title>Big data. The parable of Google Flu: traps in big data analysis</article-title>
                    <source>Science</source>
                    <year>2014</year>
                    <month>03</month>
                    <day>14</day>
                    <volume>343</volume>
                    <issue>6176</issue>
                    <fpage>1203</fpage>
                    <lpage>1205</lpage>
                    <pub-id pub-id-type="doi">10.1126/science.1248506</pub-id>
                    <pub-id pub-id-type="medline">24626916</pub-id>
                    <pub-id pub-id-type="pii">343/6176/1203</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref39">
                <label>39</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Huang</surname>
                            <given-names>J</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Kim</surname>
                            <given-names>Y</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Vera</surname>
                            <given-names>L</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Emery</surname>
                            <given-names>SL</given-names>
                        </name>
                    </person-group>
                    <article-title>Electronic Cigarettes Among Priority Populations: Role of Smoking Cessation and Tobacco Control Policies</article-title>
                    <source>Am J Prev Med</source>
                    <year>2016</year>
                    <month>02</month>
                    <volume>50</volume>
                    <issue>2</issue>
                    <fpage>199</fpage>
                    <lpage>209</lpage>
                    <pub-id pub-id-type="doi">10.1016/j.amepre.2015.06.032</pub-id>
                    <pub-id pub-id-type="medline">26410185</pub-id>
                    <pub-id pub-id-type="pii">S0749-3797(15)00357-8</pub-id>
                    <pub-id pub-id-type="pmcid">PMC4718827</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref40">
                <label>40</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Arrazola</surname>
                            <given-names>R</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Neff</surname>
                            <given-names>L</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Kennedy</surname>
                            <given-names>S</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Holder-Hayes</surname>
                            <given-names>E</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Jones</surname>
                            <given-names>C</given-names>
                        </name>
                    </person-group>
                    <article-title>Tobacco use among middle and high school students - United States, 2011-2014</article-title>
                    <source>Morbidity and Mortality Weekly Report</source>
                    <year>2014</year>
                    <month>11</month>
                    <day>14</day>
                    <volume>63</volume>
                    <issue>45</issue>
                    <fpage>1021</fpage>
                    <lpage>1026</lpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.cdc.gov/mmwr/preview/mmwrhtml/mm6414a3.htm" />
                    </comment>
                </nlm-citation>
            </ref>
            <ref id="ref41">
                <label>41</label>
                <nlm-citation citation-type="book">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Mitchell</surname>
                            <given-names>T</given-names>
                        </name>
                    </person-group>
                    <source>Machine learning</source>
                    <year>1997</year>
                    <publisher-loc>New York</publisher-loc>
                    <publisher-name>McGraw-Hill</publisher-name>
                </nlm-citation>
            </ref>
            <ref id="ref42">
                <label>42</label>
                <nlm-citation citation-type="book">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Hastie</surname>
                            <given-names>T</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Tibshirani</surname>
                            <given-names>R</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Friedman</surname>
                            <given-names>J</given-names>
                        </name>
                    </person-group>
                    <source>The elements of statistical learning: data mining, inference, and prediction</source>
                    <year>2009</year>
                    <publisher-loc>New York, NY</publisher-loc>
                    <publisher-name>Springer</publisher-name>
                </nlm-citation>
            </ref>
            <ref id="ref43">
                <label>43</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Capurro</surname>
                            <given-names>D</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Cole</surname>
                            <given-names>K</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Echavarr&#237;a</surname>
                            <given-names>MI</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Joe</surname>
                            <given-names>J</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Neogi</surname>
                            <given-names>T</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Turner</surname>
                            <given-names>AM</given-names>
                        </name>
                    </person-group>
                    <article-title>The use of social networking sites for public health practice and research: a systematic review</article-title>
                    <source>J Med Internet Res</source>
                    <year>2014</year>
                    <volume>16</volume>
                    <issue>3</issue>
                    <fpage>e79</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2014/3/e79/" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/jmir.2679</pub-id>
                    <pub-id pub-id-type="medline">24642014</pub-id>
                    <pub-id pub-id-type="pii">v16i3e79</pub-id>
                    <pub-id pub-id-type="pmcid">PMC3971364</pub-id>
                </nlm-citation>
            </ref>
            <ref id="ref44">
                <label>44</label>
                <nlm-citation citation-type="journal">
                    <person-group person-group-type="author">
                        <name name-style="western">
                            <surname>Chu</surname>
                            <given-names>K</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Sidhu</surname>
                            <given-names>A</given-names>
                        </name>
                        <name name-style="western">
                            <surname>Valente</surname>
                            <given-names>T</given-names>
                        </name>
                    </person-group>
                    <article-title>Electronic Cigarette Marketing Online: a Multi-Site, Multi-Product Comparison</article-title>
                    <source>JMIR Public Health Surveill</source>
                    <year>2015</year>
                    <month>09</month>
                    <day>11</day>
                    <volume>1</volume>
                    <issue>2</issue>
                    <fpage>e11</fpage>
                    <comment>
                        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://doi.org/10.2196/publichealth.4777" />
                    </comment>
                    <pub-id pub-id-type="doi">10.2196/publichealth.4777</pub-id>
                </nlm-citation>
            </ref>
        </ref-list>
    </back>
</article>