<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v22i8e17478</article-id>
      <article-id pub-id-type="pmid">32784184</article-id>
      <article-id pub-id-type="doi">10.2196/17478</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Machine Learning Classifiers for Twitter Surveillance of Vaping: Comparative Machine Learning Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Gore</surname>
            <given-names>Ross</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Binger</surname>
            <given-names>Kole</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Lambert</surname>
            <given-names>Natalie</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Visweswaran</surname>
            <given-names>Shyam</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Biomedical Informatics</institution>
            <institution>University of Pittsburgh</institution>
            <addr-line>The Offices at Baum</addr-line>
            <addr-line>5607 Baum Blvd, Suite 523</addr-line>
            <addr-line>Pittsburgh, PA, 15206</addr-line>
            <country>United States</country>
            <phone>1 (412) 648 7119</phone>
            <email>shv3@pitt.edu</email>
          </address>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2079-8684</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Colditz</surname>
            <given-names>Jason B</given-names>
          </name>
          <degrees>MEd</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2811-841X</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>O’Halloran</surname>
            <given-names>Patrick</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0710-7946</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Han</surname>
            <given-names>Na-Rae</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8771-0173</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Taneja</surname>
            <given-names>Sanya B</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1707-1617</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Welling</surname>
            <given-names>Joel</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1423-7001</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Chu</surname>
            <given-names>Kar-Hai</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2486-8846</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Sidani</surname>
            <given-names>Jaime E</given-names>
          </name>
          <degrees>PhD, MPH</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5411-8755</ext-link>
        </contrib>
        <contrib id="contrib9" contrib-type="author">
          <name name-style="western">
            <surname>Primack</surname>
            <given-names>Brian A</given-names>
          </name>
          <degrees>PhD, MD</degrees>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5962-0939</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Biomedical Informatics</institution>
        <institution>University of Pittsburgh</institution>
        <addr-line>Pittsburgh, PA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Intelligent Systems Program</institution>
        <institution>University of Pittsburgh</institution>
        <addr-line>Pittsburgh, PA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>School of Medicine</institution>
        <institution>University of Pittsburgh</institution>
        <addr-line>Pittsburgh, PA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Linguistics</institution>
        <institution>University of Pittsburgh</institution>
        <addr-line>Pittsburgh, PA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Pittsburgh Supercomputing Center</institution>
        <institution>Carnegie Mellon University</institution>
        <addr-line>Pittsburgh, PA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff6">
        <label>6</label>
        <institution>College of Education and Health Professions</institution>
        <institution>University of Arkansas</institution>
        <addr-line>Fayetteville, AR</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Shyam Visweswaran <email>shv3@pitt.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>8</month>
        <year>2020</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>12</day>
        <month>8</month>
        <year>2020</year>
      </pub-date>
      <volume>22</volume>
      <issue>8</issue>
      <elocation-id>e17478</elocation-id>
      <history>
        <date date-type="received">
          <day>16</day>
          <month>12</month>
          <year>2019</year>
        </date>
        <date date-type="rev-request">
          <day>10</day>
          <month>3</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>5</day>
          <month>6</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>11</day>
          <month>6</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©Shyam Visweswaran, Jason B Colditz, Patrick O’Halloran, Na-Rae Han, Sanya B Taneja, Joel Welling, Kar-Hai Chu, Jaime E Sidani, Brian A Primack. Originally published in the Journal of Medical Internet Research (http://www.jmir.org), 12.08.2020.</copyright-statement>
      <copyright-year>2020</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on http://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2020/8/e17478" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Twitter presents a valuable and relevant social media platform to study the prevalence of information and sentiment on vaping that may be useful for public health surveillance. Machine learning classifiers that identify vaping-relevant tweets and characterize sentiments in them can underpin a Twitter-based vaping surveillance system. Compared with traditional machine learning classifiers that are reliant on annotations that are expensive to obtain, deep learning classifiers offer the advantage of requiring fewer annotated tweets by leveraging the large numbers of readily available unannotated tweets.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to derive and evaluate traditional and deep learning classifiers that can identify tweets relevant to vaping, tweets of a commercial nature, and tweets with provape sentiments.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We continuously collected tweets that matched vaping-related keywords over 2 months from August 2018 to October 2018. From this data set of tweets, a set of 4000 tweets was selected, and each tweet was manually annotated for relevance (vape relevant or not), commercial nature (commercial or not), and sentiment (provape or not). Using the annotated data, we derived traditional classifiers that included logistic regression, random forest, linear support vector machine, and multinomial naive Bayes. In addition, using the annotated data set and a larger unannotated data set of tweets, we derived deep learning classifiers that included a convolutional neural network (CNN), long short-term memory (LSTM) network, LSTM-CNN network, and bidirectional LSTM (BiLSTM) network. The unannotated tweet data were used to derive word vectors that deep learning classifiers can leverage to improve performance.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>LSTM-CNN performed the best with the highest area under the receiver operating characteristic curve (AUC) of 0.96 (95% CI 0.93-0.98) for relevance, all deep learning classifiers including LSTM-CNN performed better than the traditional classifiers with an AUC of 0.99 (95% CI 0.98-0.99) for distinguishing commercial from noncommercial tweets, and BiLSTM performed the best with an AUC of 0.83 (95% CI 0.78-0.89) for provape sentiment. Overall, LSTM-CNN performed the best across all 3 classification tasks.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>We derived and evaluated traditional machine learning and deep learning classifiers to identify vaping-related relevant, commercial, and provape tweets. Overall, deep learning classifiers such as LSTM-CNN had superior performance and had the added advantage of requiring no preprocessing. The performance of these classifiers supports the development of a vaping surveillance system.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>vaping</kwd>
        <kwd>social media</kwd>
        <kwd>infodemiology</kwd>
        <kwd>infoveillance</kwd>
        <kwd>machine learning</kwd>
        <kwd>deep learning</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Machine learning methods provide a valuable framework for systematic and automated processing and analysis of data on social media platforms such as Twitter for developing surveillance systems with application to public health. The continuous generation of an enormous amount of content by a vast number of users allows for efficient real-time monitoring of sources of information and user sentiment if it can be automated. Furthermore, such monitoring can lead to the discovery of emergent patterns of information flow and changes in sentiments that may occur in response to public health and policy interventions. In this study, we derived and evaluated traditional machine learning and deep learning classifiers that can be used to build a Twitter-based surveillance system to identify and monitor vaping-related content and sentiments.</p>
      </sec>
      <sec>
        <title>Vaping and Public Health</title>
        <p>Vaping is the inhalation of aerosols that often contain nicotine combined with flavorings where the aerosols are delivered through electronic delivery systems known as electronic cigarettes (e-cigarettes) or electronic vaporizers. Evidence suggests that vaping is safer than smoking tobacco and can help with successful smoking cessation [<xref ref-type="bibr" rid="ref1">1</xref>]. However, emerging research indicates that vaping may cause cardiovascular and respiratory diseases and may pose health hazards from secondhand aerosol exposure [<xref ref-type="bibr" rid="ref2">2</xref>]. More recently, vaping has been associated with e-cigarette or vaping product use–associated lung injury, which has caused hospitalization and even death [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. There is a rising concern that vaping increases addiction among nonsmokers, especially adolescents [<xref ref-type="bibr" rid="ref5">5</xref>], and many are unaware of the addictive potential until after they become nicotine dependent [<xref ref-type="bibr" rid="ref6">6</xref>]. Thus, there is a strong need to measure and understand the risks, sentiments, and behavior related to vaping.</p>
      </sec>
      <sec>
        <title>Surveillance Using Twitter</title>
        <p>Twitter is a popular social media platform that is widely used by adolescents, young adults, and racial and ethnic minorities, all of whom are disproportionately affected by vaping [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>]. Communication on Twitter is by short succinct messages, called tweets, which are limited to 280 characters. Twitter is an open platform that enables users to see information and messages from other public users without special permission. This results in high potential exposure to each tweet, which enables systematic assessment by investigators. Furthermore, tweets heavily use hashtags (eg, #vapelife) as searchable text, which allows users to click on a linked word or phrase and navigate to other mentions of it [<xref ref-type="bibr" rid="ref10">10</xref>]. These factors make Twitter a relevant, valuable, and feasible social media platform to study.</p>
        <p>Infoveillance is the application of surveillance methods to internet-related and other electronic content to inform public health and public policy. Traditional surveys around attitudes and beliefs are too slow to optimally capture rapid changes. Infoveillance methods that use web-based data streams have proven to be more effective for several areas of public health. Investigators have used Twitter data for the infoveillance of topics such as pharmacovigilance, vaccine information, and tracking health conditions [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. For example, such data have been useful in characterizing outbreaks of food-related illness and influenza, factors surrounding prescription drug abuse [<xref ref-type="bibr" rid="ref14">14</xref>], adverse drug events [<xref ref-type="bibr" rid="ref15">15</xref>], sentiment toward the use of tobacco [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>], and use of alcohol [<xref ref-type="bibr" rid="ref18">18</xref>].</p>
      </sec>
      <sec>
        <title>Objective</title>
        <p>Our immediate objective was to derive and evaluate machine learning classifiers that can form the basis of a Twitter-based surveillance system that is focused on vaping-related tweets. Our ultimate goal is to use a surveillance system to assess key factors such as sentiment, marketing, procurement, health effects, and policy that will provide unique perspectives related to vaping. Furthermore, we plan to characterize changes over time in the volume of messaging related to vaping and other vaping-related characteristics of interest [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. Leveraging Twitter as a complement to traditional surveillance will allow for real-time identification of changes that can be used by public health practitioners. For example, when positive sentiment toward vaping rises, practitioners may be able to determine reasons for this and respond accordingly. Similarly, when there is a notable spike in misinformation about vaping and health effects, they will be able to act immediately to correct this information. As a step toward the development of a Twitter-based vaping surveillance system, we derived machine learning classifiers to automatically identify tweets that are vaping-related, are noncommercial, and express provape sentiments. Using a data set of manually annotated tweets and a larger data set of unannotated tweets, we derived and evaluated traditional machine learning and deep learning classifiers.</p>
      </sec>
      <sec>
        <title>Related Work</title>
        <p>Natural language processing, classification, and sentiment analysis of Twitter data are more taxing than other kinds of text because of the limited length of the tweets. As tweets are limited to 280 characters and the language used is informal, the messages are interspersed with abbreviations, slang, Twitter-specific terms such as usernames and hashtags, and URLs.</p>
        <p>Several investigators have derived classifiers using Twitter data in the context of vaping. For example, Han and Kavuluru [<xref ref-type="bibr" rid="ref21">21</xref>] implemented support vector machines, logistic regression, and convolutional neural networks to identify marketing and nonmarketing e-cigarette tweets. Myslin et al [<xref ref-type="bibr" rid="ref17">17</xref>] and Cole-Lewis et al [<xref ref-type="bibr" rid="ref22">22</xref>] annotated tobacco-related tweets and derived several machine learning classifiers to predict sentiment. Huang et al [<xref ref-type="bibr" rid="ref23">23</xref>] analyzed tweets using classifiers and found that tweets related to e-cigarettes were about 90% commercial and about 10% mentioned smoking cessation. Resende and Culotta [<xref ref-type="bibr" rid="ref24">24</xref>] derived a sentiment classifier for e-cigarette–related tweets that identified positive and negative tweets with 96% and 70% precision, respectively.</p>
        <p>Compared with prior work, the main contributions of this paper are (1) exploration of a large range of classifiers, including deep learning classifiers; and (2) analysis of highly relevant features in classifiers using an algorithm that provides a unified approach to explain the output of any classifier.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Collection</title>
        <p>Primary data were collected from the Twitter application programming interface using the open-source, real-time infoveillance of Twitter health messages (RITHM) software [<xref ref-type="bibr" rid="ref19">19</xref>]. RITHM allows for the real-time collection of all publicly available tweets matching a specified set of keywords. We identified and collected all tweets that matched one or more keywords that are indicative of vaping-related tweets. The keywords that we used for data collection included <italic>vape, vapes, vaper, vapers, vaping, juul, juuls,</italic> and <italic>juuling</italic>. The vaping-related keywords are based on previous Twitter research [<xref ref-type="bibr" rid="ref10">10</xref>], and, in particular, we included keywords to identify the highly popular JUUL e-cigarette [<xref ref-type="bibr" rid="ref6">6</xref>].</p>
        <sec>
          <title>Data Set for Annotation</title>
          <p>We continuously collected all publicly available tweets that matched vaping-related keywords over 2 months from August 17, 2018, to October 19, 2018. This resulted in a data set of 1,892,722 tweets. From this data set, we removed <italic>retweets</italic> (rebroadcasted messages without original content), and from the remaining original 810,600 tweets, we randomly selected a subset of 4000 tweets for manual double coding and adjudication. The removal of retweets and the random selection ensured that the tweet content was lexically diverse and sufficiently representative of tweets related to vaping. This particular period was chosen as it also included salient health policy events related to vaping. In particular, the US Food and Drug Administration (FDA) sent warning letters to retailers and manufacturers (September 12, 2018) and seized documents from JUUL headquarters (October 5, 2018). In previous studies, data sets of 4000 to 7000 tweets have been adequate for the derivation of classifiers [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>].</p>
        </sec>
        <sec>
          <title>Unannotated Data Set for Deriving Word Vectors</title>
          <p>Word vectors, also known as word embeddings, are derived from a large data set of text to capture semantic and syntactic similarity and context of each word as a vector of real numbers. Word vectors have become popular because they can improve the performance of deep learning classifiers and can reduce the volume of annotations that are needed. Word vectors have the advantage that they do not require annotations; instead, they leverage a large amount of unannotated data.</p>
          <p>We continuously collected all publicly available tweets that matched vaping-related keywords over 7 months from January 1, 2018, to July 31, 2018. This resulted in a data set of 4,078,343 tweets, and from this data set, we removed retweets to obtain a set of 1,899,851 original tweets. We used this set to derive word vectors for deep learning. The period of data selected for word vectors represents 7 months of continuous data collection and provided a sufficiently large set of tweets for deriving word vectors and simultaneously ensuring that relevant context from the tweets, in terms of language and topical diversity, is captured in the word vectors. The period of data selected for word vectors was before the period of data selected for annotations, with no overlap, as a part of the annotated set was used for the evaluation of the classifiers.</p>
        </sec>
      </sec>
      <sec>
        <title>Annotation</title>
        <p>We developed a three-level hierarchical annotation schema, as shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>. Descriptions of the labels used for annotation are provided in <xref ref-type="table" rid="table1">Table 1</xref>. The annotation procedure consisted of first annotating a tweet as vape relevant or not based on the content. A relevant tweet was further annotated as commercial or noncommercial, and a noncommercial tweet was further annotated for provape or not provape sentiments. A similar three-level hierarchical annotation schema has been used for annotating vaccination-related tweets. At the first level, a tweet is annotated as relevant or not; at the next level, only a relevant tweet is annotated as positive, negative, or neutral; and at the final level, only a negative tweet is annotated based on safety, efficacy, cost, etc [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. A hierarchical annotation schema has the advantage that all tweets need not be annotated on all possible levels, thus allowing for a reduction in annotation effort. For example, nonrelevant tweets need not be annotated further, and relevant and commercial tweets need not be annotated further.</p>
        <p>Trained annotators independently annotated 4000 tweets in batches of 100 to 200 and adjudicated annotation disagreements in the presence of a supervising investigator. Annotators considered tweet content that included both primary and secondary text (ie, quoted tweets within primary tweets). Furthermore, annotators had access to Twitter’s native platform, where they could review the context of potentially confusing content. Cohen κ coefficient was used to assess interrater agreement [<xref ref-type="bibr" rid="ref27">27</xref>] before adjudication and at regular intervals throughout the process. Initial κ coefficients were relatively modest (eg, κ=0.54 for relevance), but improved as annotators gained familiarity with the data and the domain. The κ coefficients for the final round of annotation (n=100) were 0.71 for relevance, 0.89 for commercial, and 0.70 for provape. Fully adjudicated annotations and tweet content including metadata were used for machine learning.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>A hierarchical annotation scheme for vaping-related tweets.</p>
          </caption>
          <graphic xlink:href="jmir_v22i8e17478_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Descriptions of labels used for annotating vaping-related tweets.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="150"/>
            <col width="850"/>
            <thead>
              <tr valign="top">
                <td>Labels</td>
                <td>Descriptions</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Relevant</td>
                <td>Is the tweet in English and related to the vaping topic at hand (eg, vape use or users, vaping devices, or products)?</td>
              </tr>
              <tr valign="top">
                <td>Not relevant</td>
                <td>Tweets categorized as not relevant were typically not in English or referenced vaping cannabis products specifically, such as:<break/><list list-type="bullet"><list-item><p>“Teens are smoking, vaping and eating cannabis”</p></list-item><list-item><p>“What if I vape weed?”</p></list-item></list></td>
              </tr>
              <tr valign="top">
                <td>Commercial</td>
                <td>Is the tweet selling, marketing, or advertising vaping products?</td>
              </tr>
              <tr valign="top">
                <td>Noncommercial</td>
                <td>Includes tweets that demonstrate favorability toward a product but do not directly advocate for purchasing it.</td>
              </tr>
              <tr valign="top">
                <td>Provape</td>
                <td>Is vaping associated with positive emotions or contexts? Such as:<break/><list list-type="bullet"><list-item><p>The tweet author is currently using, has recently used, or intends to use a vape product.</p></list-item><list-item><p>The tweet author indicates acceptance of others’ vaping or favorability toward others’ positive perspectives of vaping.</p></list-item><list-item><p>The tweet author mentions vaping in association with other positive aspects of society or popular culture (eg, partying, sexuality, popularity, and attractiveness).</p></list-item></list></td>
              </tr>
              <tr valign="top">
                <td>Not provape</td>
                <td>Includes tweets that are antivape, neutral or fact based, or without subjective judgment about positive or acceptable aspects of vaping.</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Machine Learning</title>
        <p>In this section, we describe the steps in machine learning that consist of preprocessing, derivation of features, and training of classifiers.</p>
        <sec>
          <title>Preprocessing and Vector Representation for Traditional Classifiers</title>
          <p>Twitter data consist of tweet metadata and tweet content. Metadata includes information related to the user’s profile (such as location, number of followers, number of friends, and tweeting frequency), information related to a tweet’s status (such as the location of the tweet), media object contained in the tweet (such as audio, video, and image), and if the tweet was in reply to another tweet. As tweets are restricted to 280 characters, their content has, in addition to the standard text, abbreviations, usernames (that are annotated with the @), hashtags (topic tags annotated with the <italic>#</italic>), Unicode characters, URLs (typically shortened pseudorandom short URLs), and emojis (icons used to express an idea or emotion). Before preprocessing, we replaced usernames, hashtags, Unicode characters, and URLs with the textual placeholders _mention_, _hashtag_, _unicode_, and _url_, respectively. We also translated emojis into textual descriptions for better interpretability. This standardized text representation of tweets ensured that the preprocessing pipeline needed to handle only text.</p>
          <p>The preprocessing pipeline consisted of 10 steps, including removal of textual placeholders (for usernames, hashtags, Unicode characters, and URLs), removal of textual descriptions of emojis, expansion of negations, removal of punctuation and digits, negation marking, normalization, stemming, removal of stopwords, and conversion to lowercase (<xref ref-type="table" rid="table2">Table 2</xref>).</p>
          <p>After preprocessing, we created 2 types of tweet representations that are useful for machine learning. In the first representation, called the vector count representation, we identify unique words in the tweet data set and represent each tweet with a vector of numbers, where a number denotes the frequency (count) of the occurrence of a unique word in the tweet. Thus, each tweet is represented by a vector that contains as many counts as the number of unique words. We also investigated an alternative vector representation called term frequency-inverse document frequency (TF-IDF) where the number assigned to a word in the vector depends not only on its frequency in a tweet but also on its frequency in the entire data set. In this representation, words that occur in the majority of the tweets are considered to be of lower importance than words that occur more rarely. As preliminary results did not demonstrate improved performance with the TF-IDF representation, we did not perform extensive experiments with this representation.</p>
          <p>Rather than applying the same set of preprocessing steps to every classifier, we searched all possible combinations of the 10 preprocessing steps for each classifier and identified the optimal set of preprocessing steps that gave the best classifier performance (<xref ref-type="table" rid="table2">Table 2</xref>).</p>
          <table-wrap position="float" id="table2">
            <label>Table 2</label>
            <caption>
              <p>Description of preprocessing steps and options used in traditional classifiers.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="150"/>
              <col width="750"/>
              <col width="100"/>
              <thead>
                <tr valign="top">
                  <td>Preprocessing steps</td>
                  <td>Descriptions</td>
                  <td>Options<sup>a</sup></td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>placeholder_remove</td>
                  <td>Remove textual placeholders such as _mention_, _hashtag_, _unicode_, and _url_</td>
                  <td>True, false</td>
                </tr>
                <tr valign="top">
                  <td>emoji_remove</td>
                  <td>Remove textual descriptions that denote emojis</td>
                  <td>True, false</td>
                </tr>
                <tr valign="top">
                  <td>negation_expand</td>
                  <td>Expand negative contractions, for example, “don’t” is expanded to “do not” and “can’t” is expanded to “cannot”</td>
                  <td>True, false</td>
                </tr>
                <tr valign="top">
                  <td>punctuation_remove</td>
                  <td>Remove all punctuation symbols</td>
                  <td>True, false</td>
                </tr>
                <tr valign="top">
                  <td>digits_remove</td>
                  <td>Remove all numeric digits (0-9)</td>
                  <td>True, false</td>
                </tr>
                <tr valign="top">
                  <td>negation_mark</td>
                  <td>Mark words that occur between a negation trigger and a punctuation mark with the NEG prefix [<xref ref-type="bibr" rid="ref28">28</xref>]</td>
                  <td>True, false</td>
                </tr>
                <tr valign="top">
                  <td>normalize</td>
                  <td>Reduce to 2 characters all consecutive characters that appear more than twice, for example, “happppy” is reduced to “happy”</td>
                  <td>True, false</td>
                </tr>
                <tr valign="top">
                  <td>stemming</td>
                  <td>Reduce inflection in words (eg, troubled, troubles) to their root form (eg, trouble) using the Porter Stemmer [<xref ref-type="bibr" rid="ref29">29</xref>]</td>
                  <td>True, false</td>
                </tr>
                <tr valign="top">
                  <td>stopwords_remove</td>
                  <td>Remove common words such as “the,” “a,” “on,” “is,” and “all” that are listed in the Natural Language Toolkit English stop words list [<xref ref-type="bibr" rid="ref30">30</xref>]</td>
                  <td>True, false</td>
                </tr>
                <tr valign="top">
                  <td>lowercase</td>
                  <td>Change the case of all characters to lowercase</td>
                  <td>True, false</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table2fn1">
                <p><sup>a</sup>If the option for a step is set to <italic>true</italic>, the corresponding preprocessing step will be applied in the preprocessing pipeline; if the option is set to <italic>false</italic>, the corresponding preprocessing step will be skipped in the pipeline.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
        <sec>
          <title>Preprocessing and Vector Representation for Deep Learning Classifiers</title>
          <p>For the deep learning classifiers, we used 2 alternative preprocessing methods: (1) a fixed preprocessing pipeline and (2) no preprocessing. The fixed preprocessing pipeline consisted of the following 5 steps (out of the possible 10 steps listed in <xref ref-type="table" rid="table2">Table 2</xref>): removal of textual placeholders, expansion of negations, removal of punctuation and digits, and conversion to lowercase. In contrast to vector count representation, which is used in traditional classifiers where a tweet is denoted by a vector of counts, in the deep learning classifiers, each word in a tweet is denoted by a word vector as described next, and each tweet is denoted by a vector of word vectors.</p>
        </sec>
        <sec>
          <title>Word Vectors</title>
          <p>Word vectors are derived from large unannotated tweet data (or other types of text data) and are increasingly used in deep learning classifiers. A word vector represents a word (not an entire tweet as in vector count representation) as a vector of numbers such that 2 words are considered to be similar in meaning if their vectors are close to each other mathematically. Word vectors capture the meaning and usage of words and are derived from patterns of how words co-occur in a large data set of tweets.</p>
          <p>We investigated the performance of word vectors from 2 types of tweet data. First, we used word vectors that are derived from a large data set of tweets of all kinds; we call these vectors general or nondomain-specific word vectors. For general word vectors, we downloaded the 200-dimension Global Vectors for Word Representation (GloVe) word vectors. The GloVe vectors were derived from 2 billion tweets of all kinds, and each word was represented by a vector of size 200 [<xref ref-type="bibr" rid="ref31">31</xref>]. Second, we used word vectors from a large data set of vaping-related tweets; we call these vectors vaping-related word vectors. We created vaping-related word vectors from a data set of tweets that were collected over 7 months from January 01, 2018 to July 31, 2018 using the vaping-related keywords. This data set contained 1,899,851 original tweets, and we used the Word2Vec algorithm [<xref ref-type="bibr" rid="ref32">32</xref>] to derive 300-dimension word vectors (additional settings for the Word2Vec algorithm included a window size of 2 and 30 epochs).</p>
        </sec>
        <sec>
          <title>Machine Learning Methods</title>
          <p>We derived and evaluated 2 families of classifiers. The traditional classifiers included logistic regression (LR), random forest (RF), linear support vector machine (SVM), and multinomial naive Bayes (NB), and we used the implementations of these classifiers in scikit-learn version 0.20.3 [<xref ref-type="bibr" rid="ref33">33</xref>]. The deep learning classifiers included convolutional neural network (CNN), long short-term memory (LSTM) network, combined LSTM and CNN (LSTM-CNN), and bidirectional LSTM (BiLSTM) network, and we used the implementations of these classifiers in Keras version 2.2.4 [<xref ref-type="bibr" rid="ref34">34</xref>].</p>
          <p>In contrast to traditional classifiers, CNNs automatically select words in tweets that are relevant. The LSTM network is a type of neural network that captures patterns of words in tweets. Conventional LSTM networks capture patterns in a single direction, from left to right, whereas BiLSTM networks capture patterns in both directions, from left to right and from right to left. Both LSTM and BiLSTM have demonstrated good performance on social media data [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>], and compared with CNNs, they can handle the variable lengths of tweets. The LSTM-CNN networks combine the advantages of CNNs and LSTM networks.</p>
          <p>We derived and evaluated separate classifiers for 3 different tasks, that is, to identify which tweets are relevant, are noncommercial, and contain provape sentiment. For these tasks, the 3 binary targets and their corresponding values are (1) relevance: relevant (positive value) versus nonrelevant (negative value), (2) commercial: commercial (positive value) versus noncommercial (negative value), and (3) sentiment: provape (positive value) versus not provape (negative value).</p>
        </sec>
        <sec>
          <title>Experimental Methods</title>
          <p>From the annotated data set of 4000 tweets, we created 3 data sets to predict relevance, commercial, and sentiment that contained 4000, 3011, and 2175 tweets, respectively (<xref ref-type="table" rid="table3">Table 3</xref>). Each data set was randomly split into training and test sets (90:10 splits) such that the sets contained the same proportion of positive targets. A total of 3600, 2709, and 1957 tweets were used in the training data sets to derive relevance, commercial, and sentiment classifiers, respectively (<xref ref-type="table" rid="table3">Table 3</xref>). We used the training set to derive the best classifier (including the selection of hyperparameters if needed) for each type of classifier. The test data sets that were used to evaluate the relevance, commercial, and sentiment classifiers included 400, 302, and 218 tweets, respectively (<xref ref-type="table" rid="table3">Table 3</xref>).</p>
          <p><xref ref-type="table" rid="table4">Table 4</xref> shows the traditional classifiers with parameter settings that we used in our experiments, and <xref ref-type="table" rid="table5">Table 5</xref> shows the parameter settings of the deep learning classifiers that we used in our experiments.</p>
          <table-wrap position="float" id="table3">
            <label>Table 3</label>
            <caption>
              <p>Description of training and test data sets.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="120"/>
              <col width="240"/>
              <col width="320"/>
              <col width="320"/>
              <thead>
                <tr valign="top">
                  <td>Targets</td>
                  <td>Total number of tweets, n (%)</td>
                  <td>Number of tweets with positive target, n (%)</td>
                  <td>Number of tweets with negative target, n (%)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Relevance</td>
                  <td>
                    <break/>
                    <list list-type="bullet"><list-item><p>Total: 4000 (100)</p></list-item><list-item><p>Training: 3600 (100)</p></list-item><list-item><p>Test: 400 (100)</p></list-item></list>
                  </td>
                  <td>Relevant<break/><list list-type="bullet"><list-item><p>Total: 3011 (75.28)</p></list-item><list-item><p>Training: 2709 (75.25)</p></list-item><list-item><p>Test: 302 (75.5)</p></list-item></list></td>
                  <td>Nonrelevant<break/><list list-type="bullet"><list-item><p>Total: 989 (24.72)</p></list-item><list-item><p>Training: 891 (24.75)</p></list-item><list-item><p>Test: 98 (24.5)</p></list-item></list></td>
                </tr>
                <tr valign="top">
                  <td>Commercial</td>
                  <td>
                    <break/>
                    <list list-type="bullet"><list-item><p>Total: 3011 (100)</p></list-item><list-item><p>Training: 2709 (100)</p></list-item><list-item><p>Test: 302 (100)</p></list-item></list>
                  </td>
                  <td>Noncommercial<break/><list list-type="bullet"><list-item><p>Total: 2175 (72.24)</p></list-item><list-item><p>Training: 1957 (72.24)</p></list-item><list-item><p>Test: 218 (72.2)</p></list-item></list></td>
                  <td>Commercial<break/><list list-type="bullet"><list-item><p>Total: 836 (27.76)</p></list-item><list-item><p>Training: 752 (27.76)</p></list-item><list-item><p>Test: 84 (27.8)</p></list-item></list></td>
                </tr>
                <tr valign="top">
                  <td>Sentiment</td>
                  <td><break/><list list-type="bullet"><list-item><p>Total: 2175 (100)</p></list-item><list-item><p>Training: 1957 (100)</p></list-item><list-item><p>Test: 218 (100)</p></list-item></list></td>
                  <td>Provape<break/><list list-type="bullet"><list-item><p>Total: 1357 (62.39)</p></list-item><list-item><p>Training: 1221 (62.39)</p></list-item><list-item><p>Test: 136 (62.4)</p></list-item></list></td>
                  <td>Not provape<break/><list list-type="bullet"><list-item><p>Total: 818 (37.61)</p></list-item><list-item><p>Training: 736 (37.61)</p></list-item><list-item><p>Test: 82 (37.6)</p></list-item></list></td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <table-wrap position="float" id="table4">
            <label>Table 4</label>
            <caption>
              <p>Description of traditional classifiers and parameter settings used in the experiments (the same parameter settings were used for the following 3 targets: relevance, commercial, and sentiment).</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="200"/>
              <col width="460"/>
              <col width="340"/>
              <thead>
                <tr valign="top">
                  <td>Classifiers</td>
                  <td>Scikit-learn functions (version)</td>
                  <td>Parameter values</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Logistic regression</td>
                  <td>sklearn.linear_model.LogisticRegression (0.20.3)</td>
                  <td>All default values except C=0.001</td>
                </tr>
                <tr valign="top">
                  <td>Random forest</td>
                  <td>sklearn.ensemble.RandomForestClassifier (0.20.3)</td>
                  <td>All default values except max_features=“sqrt”</td>
                </tr>
                <tr valign="top">
                  <td>Support vector machine</td>
                  <td>sklearn.linear_model.SGDClassifier (0.20.3)</td>
                  <td>All default values except α=.01</td>
                </tr>
                <tr valign="top">
                  <td>Naive Bayes</td>
                  <td>sklearn.naive_bayes.MultinomialNB (0.20.3)</td>
                  <td>All default values</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <table-wrap position="float" id="table5">
            <label>Table 5</label>
            <caption>
              <p>Description of deep learning classifiers, target, and parameter settings used in the experiments.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="30"/>
              <col width="150"/>
              <col width="120"/>
              <col width="700"/>
              <thead>
                <tr valign="top">
                  <td colspan="2">Deep learning classifiers</td>
                  <td>Targets</td>
                  <td>Parameter values</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td colspan="4">
                    <bold>Vaping-related word vectors</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CNN<sup>a</sup></td>
                  <td>Relevance</td>
                  <td>max_features: 166,395, embed_size: 300, max_len: 75, optimizer: rmsprop, filters: 100, kernel_size: 1, epochs: 5, batch_size: 16</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM<sup>b</sup></td>
                  <td>Relevance</td>
                  <td>max_features: 166,395, embed_size: 300, max_len: 75, optimizer: adam, epochs: 10, batch_size: 16</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM-CNN</td>
                  <td>Relevance</td>
                  <td>max_features: 166,395, embed_size: 300, max_len: 75, optimizer: adam, filters: 50, kernel_size: 2, epochs: 10, batch_size: 16</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>BiLSTM<sup>c</sup></td>
                  <td>Relevance</td>
                  <td>max_features: 166,395, embed_size: 300, max_len: 75, optimizer: adam, epochs: 10, batch_size: 16</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CNN</td>
                  <td>Commercial</td>
                  <td>max_features: 166,395, embed_size: 300, max_len: 75, optimizer: adam, filters: 100, kernel_size: 2, epochs: 10, batch_size: 16</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM</td>
                  <td>Commercial</td>
                  <td>max_features: 166,395, embed_size: 300, max_len: 75, optimizer: rmsprop, epochs: 5, batch_size: 32</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM-CNN</td>
                  <td>Commercial</td>
                  <td>max_features: 166,395, embed_size: 300, max_len: 75, optimizer: rmsprop, filters: 75, kernel_size: 2, epochs: 5, batch_size: 16</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>BiLSTM</td>
                  <td>Commercial</td>
                  <td>max_features: 166,395, embed_size: 300, max_len: 75, optimizer: adam, epochs: 5, batch_size: 64</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CNN</td>
                  <td>Sentiment</td>
                  <td>max_features: 166,395, embed_size: 300, max_len: 75, optimizer: rmsprop, filters: 100, kernel_size: 2, epochs: 10, batch_size: 32</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM</td>
                  <td>Sentiment</td>
                  <td>max_features: 166,395, embed_size: 300, max_len: 75, optimizer: adam, epochs: 5, batch_size: 64</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM-CNN</td>
                  <td>Sentiment</td>
                  <td>max_features: 166,395, embed_size: 300, max_len: 75, optimizer: adam, filters: 75, kernel_size: 3, epochs: 5, batch_size: 64</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>BiLSTM</td>
                  <td>Sentiment</td>
                  <td>max_features: 166,395, embed_size: 300, max_len: 75, optimizer: rmsprop, epochs: 5, batch_size: 32</td>
                </tr>
                <tr valign="top">
                  <td colspan="4">
                    <bold>Global Vectors for Word Representation</bold>
                    <bold>word vectors</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CNN</td>
                  <td>Relevance</td>
                  <td>max_features: 15,890, embed_size: 200, max_len: 75, optimizer: adam, filters: 100, kernel_size: 2, epochs: 10, batch_size: 16</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM</td>
                  <td>Relevance</td>
                  <td>max_features: 15,890, embed_size: 200, max_len: 75, optimizer: adam, epochs: 5, batch_size: 32</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM-CNN</td>
                  <td>Relevance</td>
                  <td>max_features: 15,890, embed_size: 200, max_len: 75, optimizer: adam, filters: 50, kernel_size: 2, epochs: 10, batch_size: 16</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>BiLSTM</td>
                  <td>Relevance</td>
                  <td>max_features: 15,890, embed_size: 200, max_len: 75, optimizer: adam, epochs: 5, batch_size: 64</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CNN</td>
                  <td>Commercial</td>
                  <td>max_features: 10,842, embed_size: 200, max_len: 75, optimizer: rmsprop, filters: 50, kernel_size: 2, epochs: 5, batch_size: 16</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM</td>
                  <td>Commercial</td>
                  <td>max_features: 10,842, embed_size: 200, max_len: 75, optimizer: adam, epochs: 5, batch_size: 16</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM-CNN</td>
                  <td>Commercial</td>
                  <td>max_features: 10,842, embed_size: 200, max_len: 75, optimizer: adam, filters: 75, kernel_size: 2, epochs: 5, batch_size: 32</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>BiLSTM</td>
                  <td>Commercial</td>
                  <td>max_features: 10,842, embed_size: 200, max_len: 75, optimizer: adam, epochs: 5, batch_size: 64</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>CNN</td>
                  <td>Sentiment</td>
                  <td>max_features: 7979, embed_size: 200, max_len: 75, optimizer: rmsprop, filters: 100, kernel_size: 3, epochs: 5, batch_size: 64</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM</td>
                  <td>Sentiment</td>
                  <td>max_features: 7979, embed_size: 200, max_len: 75, optimizer: adam, epochs: 5, batch_size: 32</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM-CNN</td>
                  <td>Sentiment</td>
                  <td>max_features: 7979, embed_size: 200, max_len: 75, optimizer: rmsprop, filters: 75, kernel_size: 1, epochs: 10, batch_size: 64</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>BiLSTM</td>
                  <td>Sentiment</td>
                  <td>max_features: 7979, embed_size: 200, max_len: 75, optimizer: adam, epochs: 5, batch_size: 32</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table5fn1">
                <p><sup>a</sup>CNN: convolutional neural network.</p>
              </fn>
              <fn id="table5fn2">
                <p><sup>b</sup>LSTM: long short-term memory.</p>
              </fn>
              <fn id="table5fn3">
                <p><sup>c</sup>BiLSTM: bidirectional long short-term memory.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
        <sec>
          <title>Evaluation of Classifier Performance</title>
          <p>We assessed the performance of the classifiers with the area under the receiver operating characteristic curve (AUC), precision, recall, and F1 scores. The AUC is a measure of discrimination, that is, how well a classifier differentiates between the positive and negative tweets, and larger values indicate better performance. Precision is the number of correctly classified positive tweets divided by the number of all positive tweets returned by the classifier, and recall is the number of correctly classified positive tweets divided by the number of all positive tweets. The F1 score is the harmonic average of the precision and recall; the F1 score achieves the best value at 1 when both precision and recall are perfect and the worst value at 0.</p>
        </sec>
        <sec>
          <title>Evaluation of Relevance</title>
          <p>To identify relevant words (features) in each classifier, we applied SHapley Additive exPlanations (SHAP), which is an algorithm for interpreting the relevance of features used in classifiers [<xref ref-type="bibr" rid="ref37">37</xref>]. SHAP assigns each feature an average relevance value based on predictions on a data set. We examined the top 10 ranked features for each classifier.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Proportions of Tweet Categories in the Annotated Data Set</title>
        <p>In the annotated data set, 75.28% of tweets were relevant to vaping, and of the vaping-relevant tweets, 72.24% were of a noncommercial nature. Of the noncommercial vaping-relevant tweets, 62.39% contained provape sentiments.</p>
      </sec>
      <sec>
        <title>Performance of Classifiers</title>
        <sec>
          <title>Relevance Classifiers</title>
          <p>Application of traditional classifiers yielded AUC values of 0.84 to 0.95, application of deep learning classifiers with vaping-related word vectors yielded AUC values of 0.90 to 0.93, and application of deep learning classifiers with GloVe word vectors yielded AUC values of 0.93 to 0.96. LR had the highest recall, whereas RF and the deep learning classifiers with GloVe word vectors had the highest F1 value. LSTM-CNN with GloVe word vectors performed the best overall with the highest AUC and precision values.</p>
        </sec>
        <sec>
          <title>Commercial Classifiers</title>
          <p>Overall, the AUC values were similar across all classifiers. Application of traditional classifiers yielded AUC values of 0.96 to 0.98, application of deep learning classifiers with vaping-related word vectors yielded AUC values of 0.97 to 0.98, and application of deep learning classifiers with GloVe word vectors yielded AUC values of 0.99. LSTM-CNN and BiLSTM with GloVe word vectors performed the best overall with the highest AUC, precision, recall, and F1 values.</p>
        </sec>
        <sec>
          <title>Sentiment Classifiers</title>
          <p>Application of traditional classifiers yielded AUC values of 0.69 to 0.78, application of deep learning classifiers with vaping-related word vectors yielded AUC values of 0.74 to 0.75, and application of deep learning classifiers with GloVe word vectors yielded AUC values of 0.78 to 0.83. BiLSTM and LSTM-CNN with GloVe word vectors performed the best overall with the highest AUC, precision, and F1 values.</p>
        </sec>
        <sec>
          <title>Preprocessing</title>
          <p>Our experiments showed that some traditional classifiers performed best with minimal preprocessing compared with others. LR and NB did not use any of the 10 preprocessing steps for any of the 3 targets (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). On the other hand, RF and SVM used 5 preprocessing steps on average (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The deep learning classifiers performed better with no preprocessing compared with the fixed preprocessing pipeline. Furthermore, in addition to the standard text in tweets, information such as URLs, usernames, hashtags, and Unicode characters was found to be important and was included in most of the classifiers.</p>
        </sec>
      </sec>
      <sec>
        <title>Feature Relevance</title>
        <p>We applied the SHAP algorithm to the 12 classifiers for each target (corresponding to the classifiers in <xref ref-type="table" rid="table6">Tables 6,</xref> <xref ref-type="table" rid="table7">7</xref>, and <xref ref-type="table" rid="table8">8</xref>) to generate 10 top-ranked features. The feature relevance plots for each classifier and target are shown in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The word <italic>vape</italic> and its variations <italic>vapes</italic>, <italic>vaping</italic>, or <italic>vapelife</italic> appear in the 10 top-ranked features in all classifiers except RF relevance and commercial classifiers. Several textual placeholders appear in traditional classifiers, whereas several Unicode characters representing emojis appear in the deep learning classifiers. Interestingly, common simple words such as <italic>we</italic>, <italic>as</italic>, <italic>was</italic>, and <italic>no</italic> appear in many classifiers.</p>
        <table-wrap position="float" id="table6">
          <label>Table 6</label>
          <caption>
            <p>Performance of relevance classifiers.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="400"/>
            <col width="300"/>
            <col width="100"/>
            <col width="100"/>
            <col width="100"/>
            <thead>
              <tr valign="top">
                <td>Classifiers</td>
                <td>Area under the receiver operating <break/>characteristic curve (95% CI)</td>
                <td>Precision</td>
                <td>Recall</td>
                <td>F1</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Logistic regression</td>
                <td>0.84 (0.78-0.89)</td>
                <td>0.80</td>
                <td>1.00</td>
                <td>0.92</td>
              </tr>
              <tr valign="top">
                <td>Random forest</td>
                <td>0.95 (0.93-0.98)</td>
                <td>0.93</td>
                <td>0.97</td>
                <td>
                  <italic>0.98</italic>
                </td>
              </tr>
              <tr valign="top">
                <td>Support vector machine</td>
                <td>0.92 (0.88-0.96)</td>
                <td>0.91</td>
                <td>0.97</td>
                <td>0.95</td>
              </tr>
              <tr valign="top">
                <td>Naive Bayes</td>
                <td>0.88 (0.83-0.93)</td>
                <td>0.88</td>
                <td>0.99</td>
                <td>0.93</td>
              </tr>
              <tr valign="top">
                <td>CNN<sup>a</sup> (vaping-related word vectors)</td>
                <td>0.94 (0.91-0.97)</td>
                <td>0.90</td>
                <td>0.97</td>
                <td>0.98</td>
              </tr>
              <tr valign="top">
                <td>LSTM<sup>b</sup> (vaping-related word vectors)</td>
                <td>0.91 (0.88-0.95)</td>
                <td>0.89</td>
                <td>0.98</td>
                <td>0.96</td>
              </tr>
              <tr valign="top">
                <td>LSTM-CNN (vaping-related word vectors)</td>
                <td>0.89 (0.85-0.93)</td>
                <td>0.93</td>
                <td>0.87</td>
                <td>0.95</td>
              </tr>
              <tr valign="top">
                <td>BiLSTM<sup>c</sup> (vaping-related word vectors)</td>
                <td>0.89 (0.85-0.94)</td>
                <td>0.90</td>
                <td>0.96</td>
                <td>0.94</td>
              </tr>
              <tr valign="top">
                <td>CNN (GloVe<sup>d</sup> word vectors)</td>
                <td>0.95 (0.92-0.97)</td>
                <td>0.93</td>
                <td>0.95</td>
                <td>0.98</td>
              </tr>
              <tr valign="top">
                <td>LSTM (GloVe word vectors)</td>
                <td>0.95 (0.92-0.98)</td>
                <td>0.95</td>
                <td>0.95</td>
                <td>0.98</td>
              </tr>
              <tr valign="top">
                <td>LSTM-CNN (GloVe word vectors)</td>
                <td>0.96 (0.93-0.98)</td>
                <td>0.96</td>
                <td>0.93</td>
                <td>0.98</td>
              </tr>
              <tr valign="top">
                <td>BiLSTM (GloVe word vectors)</td>
                <td>0.95 (0.93-0.98)</td>
                <td>0.92</td>
                <td>0.96</td>
                <td>0.98</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table6fn1">
              <p><sup>a</sup>CNN: convolutional neural network.</p>
            </fn>
            <fn id="table6fn2">
              <p><sup>b</sup>LSTM: long short-term memory.</p>
            </fn>
            <fn id="table6fn3">
              <p><sup>c</sup>BiLSTM: bidirectional long short-term memory.</p>
            </fn>
            <fn id="table6fn4">
              <p><sup>d</sup>GloVe: Global Vectors for Word Representation.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table7">
          <label>Table 7</label>
          <caption>
            <p>Performance of commercial classifiers.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="400"/>
            <col width="300"/>
            <col width="100"/>
            <col width="100"/>
            <col width="100"/>
            <thead>
              <tr valign="top">
                <td>Classifiers</td>
                <td>Area under the receiver operating <break/> characteristic curve (95% CI)</td>
                <td>Precision</td>
                <td>Recall</td>
                <td>F1</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Logistic regression</td>
                <td>0.98 (0.95-0.99)</td>
                <td>0.93</td>
                <td>0.83</td>
                <td>0.96</td>
              </tr>
              <tr valign="top">
                <td>Random forest</td>
                <td>0.97 (0.96-0.99)</td>
                <td>0.95</td>
                <td>0.82</td>
                <td>0.97</td>
              </tr>
              <tr valign="top">
                <td>Support vector machine</td>
                <td>0.98 (0.91-0.99)</td>
                <td>0.92</td>
                <td>0.86</td>
                <td>0.92</td>
              </tr>
              <tr valign="top">
                <td>Naive Bayes</td>
                <td>0.96 (0.94-0.99)</td>
                <td>0.83</td>
                <td>0.89</td>
                <td>0.92</td>
              </tr>
              <tr valign="top">
                <td>CNN<sup>a</sup> (vaping-related word vectors)</td>
                <td>0.98 (0.96-0.99)</td>
                <td>0.93</td>
                <td>0.75</td>
                <td>0.94</td>
              </tr>
              <tr valign="top">
                <td>LSTM<sup>b</sup> (vaping-related word vectors)</td>
                <td>0.97 (0.95-0.99)</td>
                <td>0.88</td>
                <td>0.81</td>
                <td>0.94</td>
              </tr>
              <tr valign="top">
                <td>LSTM-CNN (vaping-related word vectors)</td>
                <td>0.97 (0.94-0.99)</td>
                <td>0.92</td>
                <td>0.85</td>
                <td>0.94</td>
              </tr>
              <tr valign="top">
                <td>BiLSTM<sup>c</sup> (vaping-related word vectors)</td>
                <td>0.98 (0.96-0.99)</td>
                <td>0.84</td>
                <td>0.87</td>
                <td>0.95</td>
              </tr>
              <tr valign="top">
                <td>CNN (GloVe<sup>d</sup> word vectors)</td>
                <td>0.99 (0.98-0.99)</td>
                <td>0.93</td>
                <td>0.89</td>
                <td>0.98</td>
              </tr>
              <tr valign="top">
                <td>LSTM (GloVe word vectors)</td>
                <td>0.99 (0.98-0.99)</td>
                <td>0.89</td>
                <td>0.94</td>
                <td>0.98</td>
              </tr>
              <tr valign="top">
                <td>LSTM-CNN (GloVe word vectors)</td>
                <td>0.99 (0.98-0.99)</td>
                <td>0.86</td>
                <td>0.96</td>
                <td>0.99</td>
              </tr>
              <tr valign="top">
                <td>BiLSTM (GloVe word vectors)</td>
                <td>0.99 (0.98-0.99)</td>
                <td>0.97</td>
                <td>0.88</td>
                <td>0.98</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table7fn1">
              <p><sup>a</sup>CNN: convolutional neural network.</p>
            </fn>
            <fn id="table7fn2">
              <p><sup>b</sup>LSTM: long short-term memory.</p>
            </fn>
            <fn id="table7fn3">
              <p><sup>c</sup>BiLSTM: bidirectional long short-term memory.</p>
            </fn>
            <fn id="table7fn4">
              <p><sup>d</sup>GloVe: Global Vectors for Word Representation.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table8">
          <label>Table 8</label>
          <caption>
            <p>Performance of sentiment classifiers.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="400"/>
            <col width="300"/>
            <col width="100"/>
            <col width="100"/>
            <col width="100"/>
            <thead>
              <tr valign="top">
                <td>Classifiers</td>
                <td>Area under the receiver operating <break/> characteristic curve (95% CI)</td>
                <td>Precision</td>
                <td>Recall</td>
                <td>F1</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Logistic regression</td>
                <td>0.78 (0.71-0.84)</td>
                <td>0.73</td>
                <td>0.88</td>
                <td>0.82</td>
              </tr>
              <tr valign="top">
                <td>Random forest</td>
                <td>0.78 (0.70-0.83)</td>
                <td>0.78</td>
                <td>0.79</td>
                <td>0.82</td>
              </tr>
              <tr valign="top">
                <td>Support vector machine</td>
                <td>0.69 (0.64-0.78)</td>
                <td>0.66</td>
                <td>0.98</td>
                <td>0.75</td>
              </tr>
              <tr valign="top">
                <td>Naive Bayes</td>
                <td>0.75 (0.66-0.82)</td>
                <td>0.75</td>
                <td>0.79</td>
                <td>0.80</td>
              </tr>
              <tr valign="top">
                <td>CNN<sup>a</sup> (vaping-related word vectors)</td>
                <td>0.74 (0.66-0.81)</td>
                <td>0.73</td>
                <td>0.85</td>
                <td>0.80</td>
              </tr>
              <tr valign="top">
                <td>LSTM<sup>b</sup> (vaping-related word vectors)</td>
                <td>0.74 (0.69-0.82)</td>
                <td>0.75</td>
                <td>0.81</td>
                <td>0.81</td>
              </tr>
              <tr valign="top">
                <td>LSTM-CNN (vaping-related word vectors)</td>
                <td>0.75 (0.71-0.84)</td>
                <td>0.74</td>
                <td>0.91</td>
                <td>0.83</td>
              </tr>
              <tr valign="top">
                <td>BiLSTM<sup>c</sup> (vaping-related word vectors)</td>
                <td>0.74 (0.68-0.81)</td>
                <td>0.72</td>
                <td>0.91</td>
                <td>0.82</td>
              </tr>
              <tr valign="top">
                <td>CNN (GloVe<sup>d</sup> word vectors)</td>
                <td>0.81 (0.75-0.87)</td>
                <td>0.72</td>
                <td>0.96</td>
                <td>0.86</td>
              </tr>
              <tr valign="top">
                <td>LSTM (GloVe word vectors)</td>
                <td>0.78 (0.71-0.84)</td>
                <td>0.76</td>
                <td>0.82</td>
                <td>0.84</td>
              </tr>
              <tr valign="top">
                <td>LSTM-CNN (GloVe word vectors)</td>
                <td>0.80 (0.74-0.86)</td>
                <td>0.83</td>
                <td>0.84</td>
                <td>0.84</td>
              </tr>
              <tr valign="top">
                <td>BiLSTM (GloVe word vectors)</td>
                <td>0.83 (0.78-0.89)</td>
                <td>0.79</td>
                <td>0.79</td>
                <td>0.88</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table8fn1">
              <p><sup>a</sup>CNN: convolutional neural network.</p>
            </fn>
            <fn id="table8fn2">
              <p><sup>b</sup>LSTM: long short-term memory.</p>
            </fn>
            <fn id="table8fn3">
              <p><sup>c</sup>BiLSTM: bidirectional long short-term memory.</p>
            </fn>
            <fn id="table8fn4">
              <p><sup>d</sup>GloVe: Global Vectors for Word Representation.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>The relative prevalence of the 3 categories that we annotated in our data set reflects the general level of vaping-related discussions on Twitter. The high proportion of tweets of a commercial nature (72% of vaping-related tweets) reflects the observation that manufacturers of vaping products marketed their products heavily on Twitter. However, this percentage has likely decreased significantly since the beginning of 2020 because of the introduction of advertising restrictions by federal and state authorities. A high proportion of noncommercial tweets contained provape sentiments (62.39% of noncommercial tweets), suggesting that among Twitter users who post about vaping, the sentiment is overall more positive than negative in our data set, after the exclusion of marketing tweets. This reflects the growing prevalence of vaping, especially among adolescents who post more on Twitter than other age groups [<xref ref-type="bibr" rid="ref38">38</xref>]. However, as this study used data before the FDA banned a range of flavored e-cigarette cartridges, both vaping and positive sentiments related to vaping may have decreased significantly.</p>
        <p>Classifiers that we derived from our data set demonstrated high levels of performance, indicating that currently available machine learning methods can produce high-performing classifiers on a data set of only several thousand annotated tweets. Compared with traditional classifiers, deep learning classifiers had superior performance with AUC values of 0.96, 0.99, and 0.83 for predicting vaping-relevant, commercial, and provape tweets. Furthermore, our results indicate that deep learning classifiers performed the best with no preprocessing and with nondomain-specific GloVe word vectors. A few studies have shown that no preprocessing may provide better performance with Twitter data [<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]. More generally, additional research is needed to systematically examine alternate preprocessing regimes for Twitter and other types of text data [<xref ref-type="bibr" rid="ref41">41</xref>]. Although deep learning classifiers are computationally more expensive to derive compared with traditional classifiers, the lack of preprocessing and derivation of domain-specific word vectors offsets the computational cost. Moreover, the application of deep learning classifiers to new Twitter data is as computationally efficient as traditional classifiers.</p>
        <p>Analyses of the 10 top-ranked features show that similar features appear across the classifiers. In addition to English terms, emojis and Unicode characters were often identified as useful features. Several common simple terms also appear as important features; these terms may interact with other features rather than being discriminatory on their own.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>Our study has several limitations. First, we used a small list of keywords to restrict our data, rather than using the full Twitter feed. As vaping products and their discussions evolve, the list of keywords will likely become stale and will need to be updated. Second, our annotated data set was of moderate size, though the sample size of 4000 tweets was adequate for obtaining classifiers with high performance. Third, the expression of tweet sentiments related to vaping is likely to vary over time [<xref ref-type="bibr" rid="ref42">42</xref>]. It would be useful to evaluate the performance of the classifiers on data that are obtained from a different period to assess the generalizability of the classifiers over time. Fourth, there may be geographical variation in sentiments regarding vaping [<xref ref-type="bibr" rid="ref43">43</xref>], and it would be useful to evaluate the performance of the classifiers on data that are obtained from different locations. In future work, we plan to address the limitations of evaluating the classifiers over time and location. Fifth, it is not clear if individuals with certain personality traits make them more predisposed to express positive or negative sentiments [<xref ref-type="bibr" rid="ref44">44</xref>]. More research is needed to assess the degree to which sentiment reflects variance in psychological traits versus the situational context in which those traits were expressed. Finally, this study uses data before the FDA banned a range of flavored e-cigarette cartridges that were likely to have been popular among frequent Twitter users, such as adolescents. In future work, we plan to derive classifiers from data that were collected after the FDA ban on flavored e-cigarette cartridges.</p>
      </sec>
      <sec>
        <title>Future Surveillance Research</title>
        <p>Machine learning classifiers, especially deep learning classifiers, show promising performance over strictly keyword-based approaches for identifying vaping-related tweets and sentiments related to vaping. This observation provides support for the development of a vaping surveillance system. Twitter surveillance can provide relatively inexpensive opportunities for monitoring the evolution of use and sentiment toward vaping and the effects of regulations on the marketing of vaping products. We plan to develop a surveillance system that will apply the classifiers to tweets to produce daily counts of vaping-related tweets, noncommercial tweets, and provape tweets. These daily counts will be used for future behavioral and attitudinal research related to vaping as well as for correlating changes in behavior and attitudes to changes in policy, such as those issued by the FDA. We plan to use the classifiers derived in this study as a basis for comparison with classifiers that we plan to derive from data obtained after the FDA ban to understand whether the ban has altered vaping-related health attitudes and behaviors. Furthermore, we plan to develop methods to infer the age group of the authors of tweets that will enable the daily tracking of vaping and related sentiments in adolescents.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>We derived and evaluated machine learning classifiers to identify vaping-related relevant, commercial, and provape tweets. We developed a hierarchical classification scheme for vaping-related tweets and applied it to a data set of 4000 selected tweets to manually annotate them. We evaluated both traditional machine learning and deep learning classifiers using the annotated data set of 4000 tweets as well as vaping-related word vectors and GloVe word vectors that are derived from large unannotated tweet data sets. Overall, deep learning classifiers such as LSTM-CNN had superior performance and had the added advantage of requiring no preprocessing. These classifiers pave the way for the development of a vaping surveillance system.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Additional information.</p>
        <media xlink:href="jmir_v22i8e17478_app1.docx" xlink:title="DOCX File , 2152 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AUC</term>
          <def>
            <p>area under the receiver operating characteristic curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">BiLSTM</term>
          <def>
            <p>bidirectional long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">CNN</term>
          <def>
            <p>convolutional neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">e-cigarettes</term>
          <def>
            <p>electronic cigarettes</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">FDA</term>
          <def>
            <p>Food and Drug Administration</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">GloVe</term>
          <def>
            <p>Global Vectors for Word Representation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">LR</term>
          <def>
            <p>logistic regression</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">LSTM</term>
          <def>
            <p>long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">NB</term>
          <def>
            <p>naive Bayes</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">RF</term>
          <def>
            <p>random forest</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">RITHM</term>
          <def>
            <p>real-time infoveillance of Twitter health messages</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">SHAP</term>
          <def>
            <p>SHapley Additive exPlanations</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">SVM</term>
          <def>
            <p>support vector machine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">TF-IDF</term>
          <def>
            <p>term frequency-inverse document frequency</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors thank Erica Barrett, Daria Williams, and Sarah Matheny for data annotation. This work was supported by awards from the National Cancer Institute of the National Institutes of Health (R01-CA225773), the National Library of Medicine of the National Institutes of Health (R01-LM012095), and the National Science Foundation (ACI-1548562 and ACI-1445606 to the Pittsburgh Supercomputing Center). The content is solely the responsibility of the authors and does not necessarily represent the official views of the National Institutes of Health or the National Science Foundation.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rigotti</surname>
              <given-names>NA</given-names>
            </name>
          </person-group>
          <article-title>Balancing the benefits and harms of e-cigarettes: a national academies of science, engineering, and medicine report</article-title>
          <source>Ann Intern Med</source>
          <year>2018</year>
          <month>05</month>
          <day>1</day>
          <volume>168</volume>
          <issue>9</issue>
          <fpage>666</fpage>
          <lpage>7</lpage>
          <pub-id pub-id-type="doi">10.7326/M18-0251</pub-id>
          <pub-id pub-id-type="medline">29435573</pub-id>
          <pub-id pub-id-type="pii">2672813</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Czogala</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Goniewicz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Fidelus</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Zielinska-Danch</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Travers</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sobczak</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Secondhand exposure to vapors from electronic cigarettes</article-title>
          <source>Nicotine Tob Res</source>
          <year>2014</year>
          <month>06</month>
          <volume>16</volume>
          <issue>6</issue>
          <fpage>655</fpage>
          <lpage>62</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/24336346"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/ntr/ntt203</pub-id>
          <pub-id pub-id-type="medline">24336346</pub-id>
          <pub-id pub-id-type="pii">ntt203</pub-id>
          <pub-id pub-id-type="pmcid">PMC4565991</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Layden</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Ghinai</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Pray</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Kimball</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Layer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Tenforde</surname>
              <given-names>MW</given-names>
            </name>
            <name name-style="western">
              <surname>Navon</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Hoots</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Salvatore</surname>
              <given-names>PP</given-names>
            </name>
            <name name-style="western">
              <surname>Elderbrook</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Haupt</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kanne</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>MT</given-names>
            </name>
            <name name-style="western">
              <surname>Saathoff-Huber</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>King</surname>
              <given-names>BA</given-names>
            </name>
            <name name-style="western">
              <surname>Schier</surname>
              <given-names>JG</given-names>
            </name>
            <name name-style="western">
              <surname>Mikosz</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Meiman</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Pulmonary illness related to e-cigarette use in Illinois and Wisconsin - final report</article-title>
          <source>N Engl J Med</source>
          <year>2020</year>
          <month>03</month>
          <day>5</day>
          <volume>382</volume>
          <issue>10</issue>
          <fpage>903</fpage>
          <lpage>16</lpage>
          <pub-id pub-id-type="doi">10.1056/NEJMoa1911614</pub-id>
          <pub-id pub-id-type="medline">31491072</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Triantafyllou</surname>
              <given-names>GA</given-names>
            </name>
            <name name-style="western">
              <surname>Tiberio</surname>
              <given-names>PJ</given-names>
            </name>
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>RH</given-names>
            </name>
            <name name-style="western">
              <surname>Lamberty</surname>
              <given-names>PE</given-names>
            </name>
            <name name-style="western">
              <surname>Lynch</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kreit</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Gladwin</surname>
              <given-names>MT</given-names>
            </name>
            <name name-style="western">
              <surname>Morris</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chiarchiaro</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Vaping-associated acute lung injury: a case series</article-title>
          <source>Am J Respir Crit Care Med</source>
          <year>2019</year>
          <month>12</month>
          <day>1</day>
          <volume>200</volume>
          <issue>11</issue>
          <fpage>1430</fpage>
          <lpage>1</lpage>
          <pub-id pub-id-type="doi">10.1164/rccm.201909-1809LE</pub-id>
          <pub-id pub-id-type="medline">31574235</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Farzal</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Perry</surname>
              <given-names>MF</given-names>
            </name>
            <name name-style="western">
              <surname>Yarbrough</surname>
              <given-names>WG</given-names>
            </name>
            <name name-style="western">
              <surname>Kimple</surname>
              <given-names>AJ</given-names>
            </name>
          </person-group>
          <article-title>The adolescent vaping epidemic in the United States-how it happened and where we go from here</article-title>
          <source>JAMA Otolaryngol Head Neck Surg</source>
          <year>2019</year>
          <month>08</month>
          <day>22</day>
          <volume>145</volume>
          <issue>10</issue>
          <fpage>885</fpage>
          <pub-id pub-id-type="doi">10.1001/jamaoto.2019.2410</pub-id>
          <pub-id pub-id-type="medline">31436792</pub-id>
          <pub-id pub-id-type="pii">2748897</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sidani</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Colditz</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Barrett</surname>
              <given-names>EL</given-names>
            </name>
            <name name-style="western">
              <surname>Shensa</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>James</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Primack</surname>
              <given-names>BA</given-names>
            </name>
          </person-group>
          <article-title>I wake up and hit the JUUL: analyzing Twitter for JUUL nicotine effects and dependence</article-title>
          <source>Drug Alcohol Depend</source>
          <year>2019</year>
          <month>11</month>
          <day>1</day>
          <volume>204</volume>
          <fpage>107500</fpage>
          <pub-id pub-id-type="doi">10.1016/j.drugalcdep.2019.06.005</pub-id>
          <pub-id pub-id-type="medline">31499242</pub-id>
          <pub-id pub-id-type="pii">S0376-8716(19)30259-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC6878169</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Hansen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Richards</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Duke</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Allen</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Methodological considerations in analyzing Twitter data</article-title>
          <source>J Natl Cancer Inst Monogr</source>
          <year>2013</year>
          <month>12</month>
          <volume>2013</volume>
          <issue>47</issue>
          <fpage>140</fpage>
          <lpage>6</lpage>
          <pub-id pub-id-type="doi">10.1093/jncimonographs/lgt026</pub-id>
          <pub-id pub-id-type="medline">24395983</pub-id>
          <pub-id pub-id-type="pii">lgt026</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Duggan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Brenner</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>The Demographics of Social Media Users — 2012</article-title>
          <source>Pew Research Center</source>
          <year>2013</year>
          <access-date>2020-07-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.pewresearch.org/internet/wp-content/uploads/sites/9/media/Files/Reports/2013/PIP_SocialMediaUsers.pdf">https://www.pewresearch.org/internet/wp-content/uploads/sites/9/media/Files/Reports/2013/PIP_SocialMediaUsers.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eaton</surname>
              <given-names>DK</given-names>
            </name>
            <name name-style="western">
              <surname>Kann</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Kinchen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Shanklin</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ross</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hawkins</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Harris</surname>
              <given-names>WA</given-names>
            </name>
            <name name-style="western">
              <surname>Lowry</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>McManus</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Chyen</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Lim</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Whittle</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Brener</surname>
              <given-names>ND</given-names>
            </name>
            <name name-style="western">
              <surname>Wechsler</surname>
              <given-names>H</given-names>
            </name>
            <collab>Centers for Disease Control and Prevention (CDC)</collab>
          </person-group>
          <article-title>Youth risk behavior surveillance - United States, 2009</article-title>
          <source>MMWR Surveill Summ</source>
          <year>2010</year>
          <month>06</month>
          <day>4</day>
          <volume>59</volume>
          <issue>5</issue>
          <fpage>1</fpage>
          <lpage>142</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cdc.gov/mmwr/preview/mmwrhtml/ss5905a1.htm"/>
          </comment>
          <pub-id pub-id-type="medline">20520591</pub-id>
          <pub-id pub-id-type="pii">ss5905a1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Colditz</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Welling</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>NA</given-names>
            </name>
            <name name-style="western">
              <surname>James</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Primack</surname>
              <given-names>BA</given-names>
            </name>
          </person-group>
          <article-title>World vaping day: contextualizing vaping culture in online social media using a mixed methods approach</article-title>
          <source>J Mix Methods Res</source>
          <year>2017</year>
          <month>04</month>
          <day>9</day>
          <volume>13</volume>
          <issue>2</issue>
          <fpage>196</fpage>
          <lpage>215</lpage>
          <pub-id pub-id-type="doi">10.1177/1558689817702753</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Powell</surname>
              <given-names>GE</given-names>
            </name>
            <name name-style="western">
              <surname>Seifert</surname>
              <given-names>HA</given-names>
            </name>
            <name name-style="western">
              <surname>Reblin</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Burstein</surname>
              <given-names>PJ</given-names>
            </name>
            <name name-style="western">
              <surname>Blowers</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Menius</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Painter</surname>
              <given-names>JL</given-names>
            </name>
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pierce</surname>
              <given-names>CE</given-names>
            </name>
            <name name-style="western">
              <surname>Rodriguez</surname>
              <given-names>HW</given-names>
            </name>
            <name name-style="western">
              <surname>Brownstein</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Freifeld</surname>
              <given-names>CC</given-names>
            </name>
            <name name-style="western">
              <surname>Bell</surname>
              <given-names>HG</given-names>
            </name>
            <name name-style="western">
              <surname>Dasgupta</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Social media listening for routine post-marketing safety surveillance</article-title>
          <source>Drug Saf</source>
          <year>2016</year>
          <month>05</month>
          <volume>39</volume>
          <issue>5</issue>
          <fpage>443</fpage>
          <lpage>54</lpage>
          <pub-id pub-id-type="doi">10.1007/s40264-015-0385-6</pub-id>
          <pub-id pub-id-type="medline">26798054</pub-id>
          <pub-id pub-id-type="pii">10.1007/s40264-015-0385-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Massey</surname>
              <given-names>PM</given-names>
            </name>
            <name name-style="western">
              <surname>Leader</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Yom-Tov</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Budenz</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fisher</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Klassen</surname>
              <given-names>AC</given-names>
            </name>
          </person-group>
          <article-title>Applying multiple data collection tools to quantify human Papillomavirus vaccine communication on Twitter</article-title>
          <source>J Med Internet Res</source>
          <year>2016</year>
          <month>12</month>
          <day>5</day>
          <volume>18</volume>
          <issue>12</issue>
          <fpage>e318</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2016/12/e318/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/jmir.6670</pub-id>
          <pub-id pub-id-type="medline">27919863</pub-id>
          <pub-id pub-id-type="pii">v18i12e318</pub-id>
          <pub-id pub-id-type="pmcid">PMC5168526</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McClellan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Ali</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Mutter</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kroutil</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Landwehr</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Using social media to monitor mental health discussions - evidence from Twitter</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2017</year>
          <month>05</month>
          <day>1</day>
          <volume>24</volume>
          <issue>3</issue>
          <fpage>496</fpage>
          <lpage>502</lpage>
          <pub-id pub-id-type="doi">10.1093/jamia/ocw133</pub-id>
          <pub-id pub-id-type="medline">27707822</pub-id>
          <pub-id pub-id-type="pii">ocw133</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hanson</surname>
              <given-names>CL</given-names>
            </name>
            <name name-style="western">
              <surname>Cannon</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Burton</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Giraud-Carrier</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>An exploration of social circles and prescription drug abuse through Twitter</article-title>
          <source>J Med Internet Res</source>
          <year>2013</year>
          <month>09</month>
          <day>6</day>
          <volume>15</volume>
          <issue>9</issue>
          <fpage>e189</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2013/9/e189/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/jmir.2741</pub-id>
          <pub-id pub-id-type="medline">24014109</pub-id>
          <pub-id pub-id-type="pii">v15i9e189</pub-id>
          <pub-id pub-id-type="pmcid">PMC3785991</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bian</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Topaloglu</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Towards Large-scale Twitter Mining for Drug-Related Adverse Events</article-title>
          <source>Proceedings of the 2012 International Workshop on Smart Health and Wellbeing</source>
          <year>2012</year>
          <conf-name>SHB'12</conf-name>
          <conf-date>October 29, 2012</conf-date>
          <conf-loc>Maui, HI, USA</conf-loc>
          <fpage>25</fpage>
          <lpage>32</lpage>
          <pub-id pub-id-type="doi">10.1145/2389707.2389713</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Prier</surname>
              <given-names>KW</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Giraud-Carrier</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hanson</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Identifying Health-Related Topics on Twitter</article-title>
          <source>International Conference on Social Computing, Behavioral-Cultural Modeling, and Prediction</source>
          <year>2011</year>
          <conf-name>SBP'11</conf-name>
          <conf-date>March 29-31, 2011</conf-date>
          <conf-loc>College Park, MD, USA</conf-loc>
          <fpage>18</fpage>
          <lpage>25</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-642-19656-0_4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Myslín</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Conway</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Using twitter to examine smoking behavior and perceptions of emerging tobacco products</article-title>
          <source>J Med Internet Res</source>
          <year>2013</year>
          <month>08</month>
          <day>29</day>
          <volume>15</volume>
          <issue>8</issue>
          <fpage>e174</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2013/8/e174/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/jmir.2534</pub-id>
          <pub-id pub-id-type="medline">23989137</pub-id>
          <pub-id pub-id-type="pii">v15i8e174</pub-id>
          <pub-id pub-id-type="pmcid">PMC3758063</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Burton</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dadich</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Soboleva</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Competing voices: marketing and counter-marketing alcohol on Twitter</article-title>
          <source>J Nonprofit Public Sect Mark</source>
          <year>2013</year>
          <month>04</month>
          <volume>25</volume>
          <issue>2</issue>
          <fpage>186</fpage>
          <lpage>209</lpage>
          <pub-id pub-id-type="doi">10.1080/10495142.2013.787836</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Colditz</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Chu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Emery</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Larkin</surname>
              <given-names>CR</given-names>
            </name>
            <name name-style="western">
              <surname>James</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Welling</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Primack</surname>
              <given-names>BA</given-names>
            </name>
          </person-group>
          <article-title>Toward real-time infoveillance of Twitter health messages</article-title>
          <source>Am J Public Health</source>
          <year>2018</year>
          <month>08</month>
          <volume>108</volume>
          <issue>8</issue>
          <fpage>1009</fpage>
          <lpage>14</lpage>
          <pub-id pub-id-type="doi">10.2105/AJPH.2018.304497</pub-id>
          <pub-id pub-id-type="medline">29927648</pub-id>
          <pub-id pub-id-type="pmcid">PMC6050832</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eysenbach</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Infodemiology and infoveillance: framework for an emerging set of public health informatics methods to analyze search, communication and publication behavior on the internet</article-title>
          <source>J Med Internet Res</source>
          <year>2009</year>
          <month>03</month>
          <day>27</day>
          <volume>11</volume>
          <issue>1</issue>
          <fpage>e11</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2009/1/e11/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/jmir.1157</pub-id>
          <pub-id pub-id-type="medline">19329408</pub-id>
          <pub-id pub-id-type="pii">v11i1e11</pub-id>
          <pub-id pub-id-type="pmcid">PMC2762766</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Han</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kavuluru</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Exploratory Analysis of Marketing and Non-Marketing E-cigarette Themes on Twitter</article-title>
          <source>International Conference on Social Informatics</source>
          <year>2016</year>
          <conf-name>SocInfo'16</conf-name>
          <conf-date>November 11-14, 2016</conf-date>
          <conf-loc>Bellevue, USA</conf-loc>
          <fpage>307</fpage>
          <lpage>22</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-319-47874-6_22</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cole-Lewis</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Varghese</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sanders</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Schwarz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pugatch</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Augustson</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Assessing electronic cigarette-related tweets for sentiment and content using supervised machine learning</article-title>
          <source>J Med Internet Res</source>
          <year>2015</year>
          <month>08</month>
          <day>25</day>
          <volume>17</volume>
          <issue>8</issue>
          <fpage>e208</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2015/8/e208/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/jmir.4392</pub-id>
          <pub-id pub-id-type="medline">26307512</pub-id>
          <pub-id pub-id-type="pii">v17i8e208</pub-id>
          <pub-id pub-id-type="pmcid">PMC4642404</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kornfield</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Szczypka</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Emery</surname>
              <given-names>SL</given-names>
            </name>
          </person-group>
          <article-title>A cross-sectional examination of marketing of electronic cigarettes on Twitter</article-title>
          <source>Tob Control</source>
          <year>2014</year>
          <month>07</month>
          <volume>23</volume>
          <issue>Suppl 3</issue>
          <fpage>iii26</fpage>
          <lpage>30</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://tobaccocontrol.bmj.com/cgi/pmidlookup?view=long&#38;pmid=24935894"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/tobaccocontrol-2014-051551</pub-id>
          <pub-id pub-id-type="medline">24935894</pub-id>
          <pub-id pub-id-type="pii">tobaccocontrol-2014-051551</pub-id>
          <pub-id pub-id-type="pmcid">PMC4078681</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Resende</surname>
              <given-names>EC</given-names>
            </name>
            <name name-style="western">
              <surname>Culotta</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>A Demographic and Sentiment Analysis of E-cigarette Messages on Twitter</article-title>
          <source>Computer Science Department, Illinois Institute of Technology</source>
          <year>2015</year>
          <access-date>2020-07-21</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://cs.iit.edu/~culotta/pubs/resende15demographic.pdf">http://cs.iit.edu/~culotta/pubs/resende15demographic.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Du</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Tao</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Optimization on machine learning based approaches for sentiment analysis on HPV vaccines related tweets</article-title>
          <source>J Biomed Semantics</source>
          <year>2017</year>
          <month>03</month>
          <day>3</day>
          <volume>8</volume>
          <issue>1</issue>
          <fpage>9</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://jbiomedsem.biomedcentral.com/articles/10.1186/s13326-017-0120-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13326-017-0120-6</pub-id>
          <pub-id pub-id-type="medline">28253919</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13326-017-0120-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC5335787</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Du</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Tao</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Leveraging machine learning-based approaches to assess human papillomavirus vaccination sentiment trends with Twitter data</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2017</year>
          <month>07</month>
          <day>5</day>
          <volume>17</volume>
          <issue>Suppl 2</issue>
          <fpage>69</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-017-0469-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-017-0469-6</pub-id>
          <pub-id pub-id-type="medline">28699569</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-017-0469-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC5506590</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A coefficient of agreement for nominal scales</article-title>
          <source>Educ Psychol Meas</source>
          <year>1960</year>
          <month>04</month>
          <volume>20</volume>
          <issue>1</issue>
          <fpage>37</fpage>
          <lpage>46</lpage>
          <pub-id pub-id-type="doi">10.1177/001316446002000104</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Vaithyanathan</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Thumbs Up?: Sentiment Classification Using Machine Learning Techniques</article-title>
          <source>Proceedings of the ACL-02 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2002</year>
          <conf-name>EMNLP'02</conf-name>
          <conf-date>July 6-7, 2002</conf-date>
          <conf-loc>Philadelphia, PA, USA</conf-loc>
          <pub-id pub-id-type="doi">10.3115/1118693.1118704</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Porter</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>An algorithm for suffix stripping</article-title>
          <source>Program</source>
          <year>2006</year>
          <month>07</month>
          <volume>40</volume>
          <issue>3</issue>
          <fpage>211</fpage>
          <lpage>8</lpage>
          <pub-id pub-id-type="doi">10.1108/00330330610681286</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Loper</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Bird</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>NLTK: The Natural Language Toolkit</article-title>
          <source>Proceedings of the ACL-02 Workshop on Effective Tools and Methodologies for Teaching Natural Language Processing and Computational Linguistics (Vol. 1)</source>
          <year>2002</year>
          <conf-name>EMNLP '02</conf-name>
          <conf-date>July 6-7</conf-date>
          <conf-loc>Philadelphia, PA, USA</conf-loc>
          <pub-id pub-id-type="doi">10.3115/1118108.1118117</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pennington</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Socher</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>GloVe: Global Vectors for Word Representation</article-title>
          <source>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2014</year>
          <conf-name>EMNLP'14</conf-name>
          <conf-date>October 25-29, 2014</conf-date>
          <conf-loc>Doha, Qatar</conf-loc>
          <fpage>1532</fpage>
          <lpage>43</lpage>
          <pub-id pub-id-type="doi">10.3115/v1/d14-1162</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Efficient estimation of word representations in vector space</article-title>
          <source>arXiv preprint</source>
          <year>2013</year>
          <fpage>-</fpage>
          <comment>epub ahead of print<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1301.3781"/></comment>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Varoquaux</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Buitinck</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Louppe</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Grisel</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Pedregosa</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Mueller</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Scikit-learn</article-title>
          <source>J Mach Learn Res</source>
          <year>2015</year>
          <month>06</month>
          <volume>19</volume>
          <issue>1</issue>
          <fpage>29</fpage>
          <lpage>33</lpage>
          <pub-id pub-id-type="doi">10.1145/2786984.2786995</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gulli</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Pal</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <source>Deep Learning with Python</source>
          <year>2017</year>
          <publisher-loc>Birmingham, United Kingdom</publisher-loc>
          <publisher-name>Packt Publishing Ltd</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ballesteros</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Dyer</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>NA</given-names>
            </name>
          </person-group>
          <article-title>Improved Transition-based Parsing by Modeling Characters instead of Words with LSTMs</article-title>
          <source>Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2015</year>
          <conf-name>EMNLP 2015</conf-name>
          <conf-date>September 17-21</conf-date>
          <conf-loc>Lisbon, Portugal</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/d15-1041</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Bidirectional LSTM-CRF models for sequence tagging</article-title>
          <source>arXiv preprint</source>
          <year>2015</year>
          <fpage>-</fpage>
          <comment>epub ahead of print<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1508.01991"/></comment>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lundberg</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>SI</given-names>
            </name>
          </person-group>
          <article-title>A Unified Approach to Interpreting Model Predictions</article-title>
          <source>Advances in Neural Information Processing Systems 30</source>
          <year>2017</year>
          <conf-name>NIPS'17</conf-name>
          <conf-date>December 4-9, 2017</conf-date>
          <conf-loc>Long Beach, CA</conf-loc>
          <fpage>4765</fpage>
          <lpage>74</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Chew</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wenger</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cress</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bukowski</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Farrelly</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hair</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Estimated ages of JUUL Twitter followers</article-title>
          <source>JAMA Pediatr</source>
          <year>2019</year>
          <month>07</month>
          <day>1</day>
          <volume>173</volume>
          <issue>7</issue>
          <fpage>690</fpage>
          <lpage>2</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/31107511"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamapediatrics.2019.0922</pub-id>
          <pub-id pub-id-type="medline">31107511</pub-id>
          <pub-id pub-id-type="pii">2733855</pub-id>
          <pub-id pub-id-type="pmcid">PMC6537819</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hamidian</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Diab</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Rumor detection and classification for Twitter data</article-title>
          <source>arXiv preprint</source>
          <year>2019</year>
          <fpage>-</fpage>
          <comment>epub ahead of print<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1912.08926"/></comment>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zilincik</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Navrat</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Koskova</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Exploratory search on Twitter utilizing user feedback and multi-perspective microblog analysis</article-title>
          <source>PLoS One</source>
          <year>2013</year>
          <volume>8</volume>
          <issue>11</issue>
          <fpage>e78857</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0078857"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0078857</pub-id>
          <pub-id pub-id-type="medline">24265724</pub-id>
          <pub-id pub-id-type="pii">PONE-D-13-29808</pub-id>
          <pub-id pub-id-type="pmcid">PMC3827108</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Denny</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Spirling</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Text preprocessing for unsupervised learning: why it matters, when it misleads, and what to do about it</article-title>
          <source>Polit Anal</source>
          <year>2018</year>
          <month>03</month>
          <day>19</day>
          <volume>26</volume>
          <issue>2</issue>
          <fpage>168</fpage>
          <lpage>89</lpage>
          <pub-id pub-id-type="doi">10.1017/pan.2017.44</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Padilla</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kavak</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Lynch</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Gore</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Diallo</surname>
              <given-names>SY</given-names>
            </name>
          </person-group>
          <article-title>Temporal and spatiotemporal investigation of tourist attraction visit sentiment on Twitter</article-title>
          <source>PLoS One</source>
          <year>2018</year>
          <volume>13</volume>
          <issue>6</issue>
          <fpage>e0198857</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0198857"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0198857</pub-id>
          <pub-id pub-id-type="medline">29902270</pub-id>
          <pub-id pub-id-type="pii">PONE-D-18-02998</pub-id>
          <pub-id pub-id-type="pmcid">PMC6002102</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gore</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Diallo</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Padilla</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>You are what you tweet: connecting the geographic variation in America's obesity rate to Twitter content</article-title>
          <source>PLoS One</source>
          <year>2015</year>
          <volume>10</volume>
          <issue>9</issue>
          <fpage>e0133505</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0133505"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0133505</pub-id>
          <pub-id pub-id-type="medline">26332588</pub-id>
          <pub-id pub-id-type="pii">PONE-D-15-02269</pub-id>
          <pub-id pub-id-type="pmcid">PMC4557976</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Auer</surname>
              <given-names>EML</given-names>
            </name>
          </person-group>
          <article-title>Detecting Deceptive Impression Management Behaviors in Interviews Using Natural Language Processing</article-title>
          <source>ODU Digital Commons</source>
          <year>2018</year>
          <access-date>2020-07-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://digitalcommons.odu.edu/psychology_etds/70/">https://digitalcommons.odu.edu/psychology_etds/70/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
