<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="letter" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
    <article-id pub-id-type="publisher-id">v18i8e219</article-id>
    <article-id pub-id-type="pmid">27507563</article-id>
    <article-id pub-id-type="doi">10.2196/jmir.6185</article-id>
    <article-categories>
      <subj-group subj-group-type="heading">
        <subject>Letter to the Editor</subject>
      </subj-group>
      <subj-group subj-group-type="article-type">
        <subject>Letter to the Editor</subject>
      </subj-group>
    </article-categories>
    <title-group>
      <article-title>The Importance of Debiasing Social Media Data to Better Understand E-Cigarette-Related Attitudes and Behaviors</article-title>
    </title-group>
    <contrib-group>
      <contrib contrib-type="editor">
        <name>
          <surname>Bamidis</surname>
          <given-names>Panagiotis</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Benton</surname>
          <given-names>Adrian</given-names>
        </name>
      </contrib>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Fernandez-Luque</surname>
          <given-names>Luis</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="author" id="contrib1" corresp="yes" equal-contrib="yes">
      <name name-style="western">
        <surname>Allem</surname>
        <given-names>Jon-Patrick</given-names>
      </name>
      <degrees>MA, PhD</degrees>
      <xref rid="aff1" ref-type="aff">1</xref>
      <address>
        <institution>Keck School of Medicine</institution>
        <institution>Department of Preventive Medicine</institution>
        <institution>University of Southern California</institution>
        <addr-line>2001 N. Soto Street, 3rd Floor Mail</addr-line>
        <addr-line>Los Angeles, CA, 90032</addr-line>
        <country>United States</country>
        <phone>1 8586030812</phone>
        <fax>1 3234428201</fax>
        <email>allem@usc.edu</email>
      </address>  
      <ext-link ext-link-type="orcid">http://orcid.org/0000-0001-9135-8689</ext-link></contrib>
      <contrib contrib-type="author" id="contrib2" equal-contrib="yes">
        <name name-style="western">
          <surname>Ferrara</surname>
          <given-names>Emilio</given-names>
        </name>
        <degrees>PhD</degrees>
        <xref rid="aff2" ref-type="aff">2</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-1942-2831</ext-link>
      </contrib>
    </contrib-group>
    <aff id="aff1">
    <sup>1</sup>
    <institution>Keck School of Medicine</institution>
    <institution>Department of Preventive Medicine</institution>  
    <institution>University of Southern California</institution>  
    <addr-line>Los Angeles, CA</addr-line>
    <country>United States</country></aff>
    <aff id="aff2">
    <sup>2</sup>
    <institution>Information Sciences Institute</institution>
    <institution>Department of Computer Science</institution>  
    <institution>University of Southern California</institution>  
    <addr-line>Los Angeles, CA</addr-line>
    <country>United States</country></aff>
    <author-notes>
      <corresp>Corresponding Author: Jon-Patrick Allem 
      <email>allem@usc.edu</email></corresp>
    </author-notes>
    <pub-date pub-type="collection"><month>08</month><year>2016</year></pub-date>
    <pub-date pub-type="epub">
      <day>09</day>
      <month>08</month>
      <year>2016</year>
    </pub-date>
    <volume>18</volume>
    <issue>8</issue>
    <elocation-id>e219</elocation-id>
    <!--history from ojs - api-xml-->
    <history>
      <date date-type="received">
        <day>9</day>
        <month>6</month>
        <year>2016</year>
      </date>
      <date date-type="rev-request">
        <day>15</day>
        <month>7</month>
        <year>2016</year>
      </date>
      <date date-type="accepted">
        <day>27</day>
        <month>7</month>
        <year>2016</year>
      </date>
    </history>
    <!--(c) the authors - correct author names and publication date here if necessary. Date in form ', dd.mm.yyyy' after jmir.org-->
    <copyright-statement>©Jon-Patrick Allem, Emilio Ferrara. Originally published in the Journal of Medical Internet Research (http://www.jmir.org), 09.08.2016.</copyright-statement>
    <copyright-year>2016</copyright-year>
    <license license-type="open-access" xlink:href="http://creativecommons.org/licenses/by/2.0/">
      <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (http://creativecommons.org/licenses/by/2.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on http://www.jmir.org/, as well as this copyright and license information must be included.</p>
    </license>  
    <self-uri xlink:href="http://www.jmir.org/2016/8/e219/" xlink:type="simple"/>
    <related-article related-article-type="commentary-article" id="v18i2e41" ext-link-type="doi" xlink:href="10.2196/jmir.4738"  vol="18" page="e41" xlink:type="simple">http://www.jmir.org/2016/2/e41/</related-article>

<related-article related-article-type="commentary" vol="19" page="e165" xlink:href="http://www.jmir.org/2017/6/e165/" xlink:type="simple"/>




    <kwd-group>
      <kwd>Internet</kwd>
      <kwd>surveillance</kwd>
      <kwd>electronic cigarettes</kwd>
      <kwd>Twitter</kwd>
      <kwd>social media</kwd>
    </kwd-group></article-meta>
  </front>
  <body>
    <p>In a recent issue of <italic>JMIR,</italic> Kim and colleagues described a framework for data collection, quality assessment, and reporting standards for social media data used in health research [<xref ref-type="bibr" rid="ref1">1</xref>]. The authors’ framework was based on two principles: retrieval precision or “how much of retrieved data is relevant” and retrieval recall or “how much of the relevant data is retrieved.” With an in-depth knowledge of the subject matter under investigation, and refinement of the keywords to develop reliable search filters, the authors suggested that irrelevant content could be weeded out and high-quality data collection could be assured. Using the topic of electronic cigarettes (e-cigarettes), discussed on Twitter, as a case study to showcase their framework, the authors demonstrated how reporting standards could be made systematic and transparent. While the authors cogently argued for better reporting standards in social media data used in health research, and their principles regarding retrieval precision and retrieval recall were thoughtfully laid out, they overlooked the importance of identifying the sources of the content being captured during data collection. For example, Twitter has quickly become subject to third-party manipulation, in which automated accounts are created by industry groups and private companies that aim to influence discussions and promote specific ideas or products [<xref ref-type="bibr" rid="ref2">2</xref>]. This fact is absent from the framework of Kim and colleagues [<xref ref-type="bibr" rid="ref1">1</xref>], and according to their principle of retrieval precision, researchers could classify tweets about e-cigarettes as high-quality data regardless of their origin.</p>
    <p>Recent research has suggested that between 70% and 80% of tweets mentioning e-cigarettes stem from automated accounts [<xref ref-type="bibr" rid="ref3">3</xref>]. Studies that use tweets to gain insights into individual-level attitudes and behaviors are now faced with data containing substantial bias and noise. Any results drawn from data that have not been preprocessed with de-noising techniques lose validity and significance. To ignore this bias in Twitter data would be akin to a public health researcher ignoring the bias from having a sample of participants, in a survey-based study on tobacco-related attitudes, where 700 of the 1000 participants happened to be gainfully employed by a tobacco company. The survey researcher would be forced to rethink their sampling frame, and the same dilemma applies to the social media researcher relying on Twitter as their data source. We propose herein that appropriate analyses be implemented to obtain valid data sets that remove sources of bias and noise before applying the framework of Kim and colleagues.</p>
    <p>Twitter screen names responsible for each tweet collected in a data set should be obtained and each account’s recent history, interactions, and metadata should be analyzed to determine whether the account is a social bot, a computer algorithm designed to automatically produce content and engage with humans on Twitter [<xref ref-type="bibr" rid="ref2">2</xref>]. These social bots are meant to appear to be individuals operating Twitter accounts that are complete with metadata (name, location, pithy quote) and a photo or an image. Tweets from these accounts pollute social and health research data sets and need to be identified and removed. Programs like “Bot or Not?” [<xref ref-type="bibr" rid="ref2">2</xref>] use a classification system that groups each Twitter account’s features into 6 main classes: Network (diffusion patterns), User (metadata), Friends (account’s contacts), Temporal (tweet rate), Content (linguistic cues), and Sentiment (content of message). This classification system ultimately generates a score that falls on a spectrum that can then be used to determine the likelihood of any one account being a social bot. If an account is identified as a social bot, then that account and any tweets produced from that account should be removed from the data set. This platform is freely available, easy to use, and has been shown to be successful in reducing bias and noise in data sets from earlier studies led by computer scientists [<xref ref-type="bibr" rid="ref2">2</xref>].</p>
    <p>Using Twitter to examine e-cigarette-related discussion is a novel approach; however, the signal-to-noise ratio has become increasingly low [<xref ref-type="bibr" rid="ref3">3</xref>]. In other words, the ratio of information representative of individuals’ perceptions, sentiments, and behavior is low as compared with the content from social bots. Prior studies have attempted to increase the signal-to-noise ratio by employing crude techniques (eg, removing any tweet that is accompanied by a URL) [<xref ref-type="bibr" rid="ref4">4</xref>]. However, this approach and other blunt approaches (eg, methods solely relying on community detection or methods solely relying on innocent by association paradigms—an account interacting with a human user is considered human) result in misclassification (eg, the removal of a valid tweet from the data set simply because it was accompanied by a URL or keeping an invalid tweet because a human interacted with the account it originated from) [<xref ref-type="bibr" rid="ref5">5</xref>]. The debiasing techniques available to social media researchers proposed herein can be used to overcome earlier limitations.</p>
    <p>Social bots are only one source of bias in studies of Twitter posts. For example, the population of Twitter users overrepresents young people and ethnic minority groups when compared to the general population in the United States. This source of bias cannot be easily resolved by machine algorithms, and correcting such biases should be a focus of future research. The use of social bots is not confined to discussions of e-cigarettes; social bots have also been found to infiltrate political discourse, manipulate the stock market, acquire personal information, and disseminate misinformation [<xref ref-type="bibr" rid="ref5">5</xref>]. “Bot or Not?” is not a perfect system for bot detection; however, it scores a detection accuracy above 95%, suggesting that bias from inappropriate removal of legitimate accounts is minimal, especially when compared with earlier approaches [<xref ref-type="bibr" rid="ref5">5</xref>]. Researchers need to take advantage of the resources designed to reliably identify and remove third-party accounts responsible for the noise in social media data. Once debiasing techniques have been exploited, frameworks for data collection, quality assessment, and reporting standards for social media data used in health research should be employed.</p>
  </body>
  <back>
    <ack>
      <p>Research reported in this publication was supported by Grant # P50CA180905 from the National Cancer Institute and the FDA Center for Tobacco Products (CTP). The NIH or FDA had no role in study design, collection, analysis, and interpretation of data, writing the report, and the decision to submit the report for publication. The content is solely the responsibility of the authors and does not necessarily represent the official views of the NIH or FDA.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Kim</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Huang</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Emery</surname>
            <given-names>S</given-names>
          </name>
        </person-group>
        <article-title>Garbage in, Garbage Out: Data Collection, Quality Assessment and Reporting Standards for Social Media Data Use in Health Research, Infodemiology and Digital Disease Detection</article-title>
        <source>J Med Internet Res</source>  
        <year>2016</year>  
        <volume>18</volume>  
        <issue>2</issue>  
        <fpage>e41</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2016/2/e41/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/jmir.4738</pub-id>
        <pub-id pub-id-type="medline">26920122</pub-id>
        <pub-id pub-id-type="pii">v18i2e41</pub-id>
        <pub-id pub-id-type="pmcid">PMC4788740</pub-id></nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Davis</surname>
            <given-names>CA</given-names>
          </name>
          <name name-style="western">
            <surname>Varol</surname>
            <given-names>O</given-names>
          </name>
          <name name-style="western">
            <surname>Ferrara</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Flammini</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Menczer</surname>
            <given-names>F</given-names>
          </name>
        </person-group>
        <article-title>BotOrNot: A system to evaluate social bots</article-title>
 
        <conf-name>The 25th International Conference Companion on World Wide Web</conf-name>
        <conf-date>2016</conf-date>
        <conf-loc>Montreal, Canada</conf-loc>
        <fpage>273</fpage>  
        <lpage>274</lpage> </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Clark</surname>
            <given-names>EM</given-names>
          </name>
          <name name-style="western">
            <surname>Jones</surname>
            <given-names>CA</given-names>
          </name>
          <name name-style="western">
            <surname>Williams</surname>
            <given-names>JR</given-names>
          </name>
          <name name-style="western">
            <surname>Kurti</surname>
            <given-names>AN</given-names>
          </name>
          <name name-style="western">
            <surname>Norotsky</surname>
            <given-names>MC</given-names>
          </name>
          <name name-style="western">
            <surname>Danforth</surname>
            <given-names>CM</given-names>
          </name>
          <name name-style="western">
            <surname>Dodds</surname>
            <given-names>PS</given-names>
          </name>
        </person-group>
        <article-title>Vaporous Marketing: Uncovering Pervasive Electronic Cigarette Advertisements on Twitter</article-title>
        <source>PLoS One</source>  
        <year>2016</year>  
        <volume>11</volume>  
        <issue>7</issue>  
        <fpage>e0157304</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0157304"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1371/journal.pone.0157304</pub-id>
        <pub-id pub-id-type="medline">27410031</pub-id>
        <pub-id pub-id-type="pii">PONE-D-15-39861</pub-id></nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Huang</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Kornfield</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Szczypka</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Emery</surname>
            <given-names>SL</given-names>
          </name>
        </person-group>
        <article-title>A cross-sectional examination of marketing of electronic cigarettes on Twitter</article-title>
        <source>Tob Control</source>  
        <year>2014</year>  
        <month>07</month>  
        <volume>23 Suppl 3</volume>  
        <fpage>iii26</fpage>  
        <lpage>30</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://tobaccocontrol.bmj.com/cgi/pmidlookup?view=long&#38;pmid=24935894"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1136/tobaccocontrol-2014-051551</pub-id>
        <pub-id pub-id-type="medline">24935894</pub-id>
        <pub-id pub-id-type="pii">tobaccocontrol-2014-051551</pub-id>
        <pub-id pub-id-type="pmcid">PMC4078681</pub-id></nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Ferrara</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Varol</surname>
            <given-names>O</given-names>
          </name>
          <name name-style="western">
            <surname>Davis</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Menczer</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Flammini</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>The rise of social bots</article-title>
        <source>Commun. ACM</source>  
        <year>2016</year>  
        <month>06</month>  
        <day>24</day>  
        <volume>59</volume>  
        <issue>7</issue>  
        <fpage>96</fpage>  
        <lpage>104</lpage>  
        <pub-id pub-id-type="doi">10.1145/2818717</pub-id></nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>