<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<?covid-19-tdm?>
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v23i1e25314</article-id>
      <article-id pub-id-type="pmid">33449904</article-id>
      <article-id pub-id-type="doi">10.2196/25314</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Toward Using Twitter for Tracking COVID-19: A Natural Language Processing Pipeline and Exploratory Data Set</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Verspoor</surname>
            <given-names>Karin</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Foufi</surname>
            <given-names>Vasiliki</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Ji</surname>
            <given-names>Xiaonan</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Sheets</surname>
            <given-names>Lincoln</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Klein</surname>
            <given-names>Ari Z</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Biostatistics, Epidemiology, and Informatics</institution>
            <institution>Perelman School of Medicine</institution>
            <institution>University of Pennsylvania</institution>
            <addr-line>421A Blockley Hall</addr-line>
            <addr-line>423 Guardian Dr</addr-line>
            <addr-line>Philadelphia, PA, 19104</addr-line>
            <country>United States</country>
            <phone>1 215 746 1101</phone>
            <email>ariklein@pennmedicine.upenn.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8281-3464</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Magge</surname>
            <given-names>Arjun</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4109-1346</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>O'Connor</surname>
            <given-names>Karen</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7709-3813</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Flores Amaro</surname>
            <given-names>Jesus Ivan</given-names>
          </name>
          <degrees>BS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1912-0112</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Weissenbacher</surname>
            <given-names>Davy</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8331-3675</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Gonzalez Hernandez</surname>
            <given-names>Graciela</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6416-9556</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Biostatistics, Epidemiology, and Informatics</institution>
        <institution>Perelman School of Medicine</institution>
        <institution>University of Pennsylvania</institution>
        <addr-line>Philadelphia, PA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Ari Z Klein <email>ariklein@pennmedicine.upenn.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>1</month>
        <year>2021</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>22</day>
        <month>1</month>
        <year>2021</year>
      </pub-date>
      <volume>23</volume>
      <issue>1</issue>
      <elocation-id>e25314</elocation-id>
      <history>
        <date date-type="received">
          <day>27</day>
          <month>10</month>
          <year>2020</year>
        </date>
        <date date-type="rev-request">
          <day>5</day>
          <month>12</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>14</day>
          <month>12</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>14</day>
          <month>12</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©Ari Z Klein, Arjun Magge, Karen O'Connor, Jesus Ivan Flores Amaro, Davy Weissenbacher, Graciela Gonzalez Hernandez. Originally published in the Journal of Medical Internet Research (http://www.jmir.org), 22.01.2021.</copyright-statement>
      <copyright-year>2021</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on http://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="http://www.jmir.org/2021/1/e25314/" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>In the United States, the rapidly evolving COVID-19 outbreak, the shortage of available testing, and the delay of test results present challenges for actively monitoring its spread based on testing alone.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>The objective of this study was to develop, evaluate, and deploy an automatic natural language processing pipeline to collect user-generated Twitter data as a complementary resource for identifying potential cases of COVID-19 in the United States that are not based on testing and, thus, may not have been reported to the Centers for Disease Control and Prevention.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>Beginning January 23, 2020, we collected English tweets from the Twitter Streaming application programming interface that mention keywords related to COVID-19. We applied handwritten regular expressions to identify tweets indicating that the user potentially has been exposed to COVID-19. We automatically filtered out “reported speech” (eg, quotations, news headlines) from the tweets that matched the regular expressions, and two annotators annotated a random sample of 8976 tweets that are geo-tagged or have profile location metadata, distinguishing tweets that self-report potential cases of COVID-19 from those that do not. We used the annotated tweets to train and evaluate deep neural network classifiers based on bidirectional encoder representations from transformers (BERT). Finally, we deployed the automatic pipeline on more than 85 million unlabeled tweets that were continuously collected between March 1 and August 21, 2020.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Interannotator agreement, based on dual annotations for 3644 (41%) of the 8976 tweets, was 0.77 (Cohen κ). A deep neural network classifier, based on a BERT model that was pretrained on tweets related to COVID-19, achieved an F<sub>1</sub>-score of 0.76 (precision=0.76, recall=0.76) for detecting tweets that self-report potential cases of COVID-19. Upon deploying our automatic pipeline, we identified 13,714 tweets that self-report potential cases of COVID-19 and have US state–level geolocations.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>We have made the 13,714 tweets identified in this study, along with each tweet’s time stamp and US state–level geolocation, publicly available to download. This data set presents the opportunity for future work to assess the utility of Twitter data as a complementary resource for tracking the spread of COVID-19.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>natural language processing</kwd>
        <kwd>social media</kwd>
        <kwd>data mining</kwd>
        <kwd>COVID-19</kwd>
        <kwd>coronavirus</kwd>
        <kwd>pandemics</kwd>
        <kwd>epidemiology</kwd>
        <kwd>infodemiology</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>In the United States, the rapidly evolving COVID-19 outbreak, the shortage of available testing, and the delay of test results have presented challenges for actively monitoring the spread of COVID-19 based on testing alone. An approach that has emerged for detecting cases without the need for extensive testing relies on voluntary self-reports of symptoms from the general population [<xref ref-type="bibr" rid="ref1">1</xref>]. Considering that nearly one of every four adults in the United States already uses Twitter, and nearly half of them use it on a daily basis [<xref ref-type="bibr" rid="ref2">2</xref>], researchers have begun exploring tweets for mentions of COVID-19 symptoms [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. However, considering the incubation period of COVID-19 [<xref ref-type="bibr" rid="ref9">9</xref>], detecting cases based on symptoms may not maximize the potential of Twitter data for real-time monitoring. The objective of this study was to develop, evaluate, and deploy a natural language processing (NLP) pipeline that automatically collects tweets reporting personal information more broadly—that is, beyond symptoms—that might indicate exposure to COVID-19 in the United States. In this paper, we present a publicly available data set containing 13,714 tweets that were identified by our automatic NLP pipeline between March 1 and August 21, 2020, with each tweet’s time stamp and US state–level geolocation. This data set presents the opportunity to explore the use of Twitter data as a complementary resource “to understand and model the transmission and trajectory of COVID-19” [<xref ref-type="bibr" rid="ref10">10</xref>].</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Collection and Annotation</title>
        <p>The Institutional Review Board (IRB) of the University of Pennsylvania reviewed this study and deemed it to be exempt human subjects research under Category (4) of Paragraph (b) of the US Code of Federal Regulations Title 45 Section 46.101 for publicly available data sources (45 CFR §46.101(b)(4)).</p>
        <p>Between January 23 and March 20, 2020, we collected more than 7 million publicly available tweets that mention keywords related to COVID-19, are posted in English, are not retweets, and are geo-tagged or have user profile location metadata. We developed handwritten regular expressions (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>)—search patterns designed to automatically match text strings—to identify a subset of the 7 million tweets that indicate that the user potentially has been exposed to COVID-19. Our query patterns were designed primarily to help identify potential cases of COVID-19 that are not based on testing and, thus, may not have been reported to the Centers for Disease Control and Prevention (CDC) [<xref ref-type="bibr" rid="ref11">11</xref>]. The regular expressions matched approximately 160,000 (2%) of the 7 million tweets. Approximately 30,000 (19%) of the 160,000 matching tweets were then automatically removed using a system we developed in recent work [<xref ref-type="bibr" rid="ref12">12</xref>] for filtering out “reported speech” (eg, quotations, news headlines) from health-related social media data.</p>
        <p>In preliminary work [<xref ref-type="bibr" rid="ref13">13</xref>], two annotators annotated a random sample of 10,000 of the 130,000 filtered tweets, and annotation guidelines (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>) were developed to help the annotators distinguish between three classes of tweets. However, since then, we have removed 1024 of the annotated tweets that were collected from the Twitter Streaming application programming interface (API) based on a keyword that we have stopped using, and we have unified two of the classes. “Potential case” tweets include those that indicate that the user or a member of the user’s household was denied testing for COVID-19, was showing symptoms of COVID-19, was potentially exposed to presumptive or confirmed cases of COVID-19, or had had experiences that pose a higher risk of exposure to COVID-19. “Other” tweets are related to COVID-19 and may discuss topics such as testing, symptoms, traveling, or social distancing, but do not indicate that the user or a member of the user’s household may be infected. Among the remaining 8976 tweets, 3644 (41%) were annotated by both annotators. Upon resolving the annotators’ disagreements, 1456 (16%) of the tweets were annotated as “potential case” and 7520 (84%) as “other.” <xref ref-type="boxed-text" rid="box1">Textbox 1</xref> presents (slightly modified) sample tweets that match our handwritten regular expressions and were manually annotated as “potential case.”</p>
        <boxed-text id="box1" position="float">
          <title>Sample (slightly modified) tweets that match our handwritten regular expressions and were manually annotated as potential cases of COVID-19.</title>
          <list list-type="order">
            <list-item>
              <p>Nearly two weeks ago I had a fever, sore throat, runny nose, and cough. I want to know if it was coronavirus or just the common cold</p>
            </list-item>
            <list-item>
              <p>My coworker in next office probably has #coronavirus. He and his wife have the symptoms, but they went to the hospital to get tested and were refused.</p>
            </list-item>
            <list-item>
              <p>This girl in my class had the coronavirus, so I’m making an appointment with my doctor for a check up</p>
            </list-item>
            <list-item>
              <p>Pretty sure I had a patient tonight with Coronavirus. Had all the symptoms and tested negative for the flu.</p>
            </list-item>
            <list-item>
              <p>Why can celebrities, sports athletes &#38; politicians without symptoms get tested, but my symptomatic child who has a compromised immune system cannot? #coronavirus</p>
            </list-item>
            <list-item>
              <p>Since getting back from Seattle I’ve been sick and want to get a #coronavirus check. Called my PCP, they said to call health dept. Called them, they said I need to go thru my PCP. Called my PCP again, they said they can’t help me</p>
            </list-item>
            <list-item>
              <p>I’m convinced I have coronavirus. I’ve been to NYC, Phoenix, and San Diego in the last few weeks. I have a cough, a runny nose, and I’m really hot #covid19</p>
            </list-item>
            <list-item>
              <p>Scared of the coronavirus because I have a sore throat and a headache I think its just a cold but I take the tube 4 times a day</p>
            </list-item>
            <list-item>
              <p>Can’t even get testing SCHEDULED while self-quarantined (my decision) and having coronavirus symptoms I take train thru New Rochelle to Manhattan</p>
            </list-item>
            <list-item>
              <p>I have a bad cold. I went to the doctor, got some medications, the norm. But they couldn’t rule out coronavirus because they don’t have the tests.</p>
            </list-item>
          </list>
        </boxed-text>
        <p>As <xref ref-type="boxed-text" rid="box1">Textbox 1</xref> illustrates, our handwritten regular expressions are based on query patterns designed to identify tweets that report personal information that may be useful for tracking potential cases of COVID-19, including not only symptoms (tweet 1), but also exposure to potential cases and a lack of access to COVID-19 testing. For example, our regular expressions retrieve tweets reporting that the user may have come in contact with coworkers (tweet 2), classmates (tweet 3), patients (tweet 4), and family members (tweet 5) who may have COVID-19, and potential exposure to COVID-19 through traveling (tweets 6 and 7) and commutes (tweets 8 and 9). Our regular expressions also retrieve tweets reporting that the user (tweets 9 and 10), a family member (tweet 5), or someone else that the user has been in contact with (tweet 2) was denied access to testing, even though they are sick. Since none of the tweets in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref> report being tested for or diagnosed with COVID-19, they represent potential cases that may not have been reported to the CDC.</p>
      </sec>
      <sec>
        <title>Automatic Classification and Geolocation</title>
        <p>We split the 8976 annotated tweets into 80% (7181 tweets) and 20% (1795 tweets) random sets—a training set (<xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>) and held-out test set, respectively—for automatic classification. We used the <italic>ktrain</italic> [<xref ref-type="bibr" rid="ref14">14</xref>] Python library to train and evaluate two supervised deep neural network classifiers based on bidirectional encoder representations from transformers (BERT): BERT-Base-Uncased [<xref ref-type="bibr" rid="ref15">15</xref>] and COVID-Twitter-BERT [<xref ref-type="bibr" rid="ref16">16</xref>]. After feeding the sequence of tweet tokens to BERT, the encoded representation is passed to a dropout layer (dropping rate of 0.1), followed by a dense layer with 2 units and a softmax activation, which predicts the class for each tweet. For training, we used Adam optimization with rate decay and warm-up. We used a batch size of 64, training runs for 3 epochs, and a maximum learning rate of 1 × 10<sup>-5</sup>. We fine-tuned all layers of the transformer model with our annotated tweets. Prior to automatic classification, we preprocessed the tweets by normalizing usernames and URLs, and lowercasing the text. <xref rid="figure1" ref-type="fig">Figure 1</xref> illustrates our automatic NLP pipeline for detecting tweets that indicate potential cases of COVID-19 in the United States. We deployed the pipeline on more than 85 million unlabeled tweets that were continuously collected between March 1 and August 21, 2020. We used Carmen [<xref ref-type="bibr" rid="ref17">17</xref>] to infer the geolocation—at the US state level—of tweets that the classifier predicted as potential cases.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Automatic natural language processing (NLP) pipeline for detecting tweets that self-report potential cases of COVID-19 in the United States.</p>
          </caption>
          <graphic xlink:href="jmir_v23i1e25314_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>Interannotator agreement, based on dual annotations for 3644 (41%) of the 8976 tweets, was 0.77 (Cohen κ), considered “substantial agreement” [<xref ref-type="bibr" rid="ref18">18</xref>]. We evaluated two deep neural network classifiers on a held-out test set of 1795 (20%) of the 8976 tweets. The classifier based on the BERT-Base-Uncased pretrained model achieved an F<sub>1</sub>-score of 0.70 (precision=0.72, recall=0.67) for the “potential case” class, and the classifier based on the COVID-Twitter-BERT pretrained model achieved an F<sub>1</sub>-score of 0.76 (precision=0.76, recall=0.76), where:</p>
      <disp-formula>
        <graphic xlink:href="jmir_v23i1e25314_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </disp-formula>
      <p>We deployed our automatic pipeline, using the COVID-Twitter-BERT classifier, on more than 85 million unlabeled tweets that were continuously collected from the Twitter Streaming API between March 1 and August 21, 2020. Among the subset of tweets that were posted in English, were not retweets, matched the regular expressions, and were not filtered out as reported speech, the COVID-Twitter-BERT classifier detected 13,714 “potential case” tweets for which Carmen inferred a US state–level geolocation. <xref rid="figure2" ref-type="fig">Figure 2</xref> illustrates the ranges of “potential case” tweets that were automatically detected per state. We automatically detected “potential case” tweets from all 50 states, with the highest numbers posted in California, New York, Texas, and Florida.</p>
      <fig id="figure2" position="float">
        <label>Figure 2</label>
        <caption>
          <p>Tweets self-reporting potential cases of COVID-19 in the United States, by state, between March 1 and August 21, 2020.</p>
        </caption>
        <graphic xlink:href="jmir_v23i1e25314_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>While Twitter data has been used to identify self-reports of symptoms by people who have tested positive for COVID-19 [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>], the shortage of available testing and the delay of test results in the United States motivated us to assess whether Twitter data could be scaled to identify potential cases of COVID-19 that are not based on testing and, thus, may not have been reported to the CDC. There are studies that have not limited their exploration of COVID-19 symptoms on Twitter to users who have tested positive for COVID-19 [<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]; however, limiting the detection of potential cases to symptoms may still underutilize the information available on Twitter. Our automatic NLP pipeline has detected potential cases of COVID-19 across the entire United States that are neither based on testing nor limited to symptoms, providing the opportunity to explore the utility of Twitter data more broadly as a complementary resource for tracking the spread of COVID-19. An analysis based on this data set is beyond the scope of this study. The 13,714 “potential case” tweets identified in this study can be downloaded using a Python script [<xref ref-type="bibr" rid="ref19">19</xref>] and the input file in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>, which contains the user ID, tweet ID, time stamp, and inferred state-level geolocation for each tweet. The script downloads the tweets that are still publicly available.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This paper presented an automatic NLP pipeline that was used to identify 13,714 tweets self-reporting potential cases of COVID-19 in the United States between March 1 and August 21, 2020, that may not have been reported to the CDC. This publicly available data set presents the opportunity for future work to assess the utility of Twitter data as a complementary resource for tracking the spread of COVID-19.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Regular expressions.</p>
        <media xlink:href="jmir_v23i1e25314_app1.txt" xlink:title="TXT File, 3 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Annotation guidelines.</p>
        <media xlink:href="jmir_v23i1e25314_app2.pdf" xlink:title="PDF File (Adobe PDF File), 1060 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Training data.</p>
        <media xlink:href="jmir_v23i1e25314_app3.txt" xlink:title="TXT File, 249 KB"/>
      </supplementary-material>
      <supplementary-material id="app4">
        <label>Multimedia Appendix 4</label>
        <p>Exploratory Twitter data set.</p>
        <media xlink:href="jmir_v23i1e25314_app4.txt" xlink:title="TXT File, 851 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">API</term>
          <def>
            <p>application programming interface</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">BERT</term>
          <def>
            <p>bidirectional encoder representations from transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">CDC</term>
          <def>
            <p>Centers for Disease Control and Prevention</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>AZK contributed to the methodology, formal analysis, investigation, data curation, and writing the original draft. AM contributed to the software development, formal analysis, investigation, and writing the original draft. KO contributed to the data curation and writing (review and editing). JIFA contributed to the software development and writing (review and editing). DW contributed to the software development, formal analysis, investigation, and writing (review and editing). GGH contributed to the conceptualization, writing (review and editing), supervision, and funding acquisition. The authors would like to thank Alexis Upshur for contributing to annotating the Twitter data. This work was supported by the National Institutes of Health (NIH) National Library of Medicine (NLM; grant number R01LM011176) and National Institute of Allergy and Infectious Diseases (NIAID; grant number R01AI117011).</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Menni</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Valdes</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Freidin</surname>
              <given-names>MB</given-names>
            </name>
            <name name-style="western">
              <surname>Sudre</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Drew</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Ganesh</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Varsavsky</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Cardoso</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>El-Sayed Moustafa</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Visconti</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hysi</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bowyer</surname>
              <given-names>RCE</given-names>
            </name>
            <name name-style="western">
              <surname>Mangino</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Falchi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wolf</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ourselin</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chan</surname>
              <given-names>AT</given-names>
            </name>
            <name name-style="western">
              <surname>Steves</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Spector</surname>
              <given-names>TD</given-names>
            </name>
          </person-group>
          <article-title>Real-time tracking of self-reported symptoms to predict potential COVID-19</article-title>
          <source>Nat Med</source>
          <year>2020</year>
          <month>07</month>
          <day>11</day>
          <volume>26</volume>
          <issue>7</issue>
          <fpage>1037</fpage>
          <lpage>1040</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/32393804"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41591-020-0916-2</pub-id>
          <pub-id pub-id-type="medline">32393804</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-020-0916-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC7751267</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Anderson</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Social media use in 2018</article-title>
          <source>Pew Research Center</source>
          <year>2018</year>
          <month>03</month>
          <day>01</day>
          <access-date>2020-09-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.pewresearch.org/internet/2018/03/01/social-media-use-in-2018/">https://www.pewresearch.org/internet/2018/03/01/social-media-use-in-2018/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sarker</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lakamana</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hogg-Bremer</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Al-Garadi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Self-reported COVID-19 symptoms on Twitter: an analysis and a research resource</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2020</year>
          <month>08</month>
          <day>01</day>
          <volume>27</volume>
          <issue>8</issue>
          <fpage>1310</fpage>
          <lpage>1315</lpage>
          <pub-id pub-id-type="doi">10.1093/jamia/ocaa116</pub-id>
          <pub-id pub-id-type="medline">32620975</pub-id>
          <pub-id pub-id-type="pii">5867237</pub-id>
          <pub-id pub-id-type="pmcid">PMC7337747</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jeon</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Baruah</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Sarabadani</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Palanica</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Identification of Risk Factors and Symptoms of COVID-19: Analysis of Biomedical Literature and Social Media Data</article-title>
          <source>J Med Internet Res</source>
          <year>2020</year>
          <month>10</month>
          <day>02</day>
          <volume>22</volume>
          <issue>10</issue>
          <fpage>e20509</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2020/10/e20509/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/20509</pub-id>
          <pub-id pub-id-type="medline">32936770</pub-id>
          <pub-id pub-id-type="pii">v22i10e20509</pub-id>
          <pub-id pub-id-type="pmcid">PMC7537723</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mackey</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Purushothaman</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Nali</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bardier</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cuomo</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Machine Learning to Detect Self-Reporting of Symptoms, Testing Access, and Recovery Associated With COVID-19 on Twitter: Retrospective Big Data Infoveillance Study</article-title>
          <source>JMIR Public Health Surveill</source>
          <year>2020</year>
          <month>06</month>
          <day>08</day>
          <volume>6</volume>
          <issue>2</issue>
          <fpage>e19509</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://publichealth.jmir.org/2020/2/e19509/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/19509</pub-id>
          <pub-id pub-id-type="medline">32490846</pub-id>
          <pub-id pub-id-type="pii">v6i2e19509</pub-id>
          <pub-id pub-id-type="pmcid">PMC7282475</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Panuganti</surname>
              <given-names>BA</given-names>
            </name>
            <name name-style="western">
              <surname>Jafari</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>MacDonald</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>DeConde</surname>
              <given-names>AS</given-names>
            </name>
          </person-group>
          <article-title>Predicting COVID-19 Incidence Using Anosmia and Other COVID-19 Symptomatology: Preliminary Analysis Using Google and Twitter</article-title>
          <source>Otolaryngol Head Neck Surg</source>
          <year>2020</year>
          <month>09</month>
          <volume>163</volume>
          <issue>3</issue>
          <fpage>491</fpage>
          <lpage>497</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/10.1177/0194599820932128?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub%3dpubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/0194599820932128</pub-id>
          <pub-id pub-id-type="medline">32484425</pub-id>
          <pub-id pub-id-type="pmcid">PMC7267744</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Guntuku</surname>
              <given-names>SC</given-names>
            </name>
            <name name-style="western">
              <surname>Sherman</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Stokes</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Seltzer</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Merchant</surname>
              <given-names>RM</given-names>
            </name>
            <name name-style="western">
              <surname>Ungar</surname>
              <given-names>LH</given-names>
            </name>
          </person-group>
          <article-title>Tracking Mental Health and Symptom Mentions on Twitter During COVID-19</article-title>
          <source>J Gen Intern Med</source>
          <year>2020</year>
          <month>09</month>
          <day>07</day>
          <volume>35</volume>
          <issue>9</issue>
          <fpage>2798</fpage>
          <lpage>2800</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/32638321"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s11606-020-05988-8</pub-id>
          <pub-id pub-id-type="medline">32638321</pub-id>
          <pub-id pub-id-type="pii">10.1007/s11606-020-05988-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC7340749</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Radloff</surname>
              <given-names>CL</given-names>
            </name>
            <name name-style="western">
              <surname>Wawrzynski</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Cloyes</surname>
              <given-names>KG</given-names>
            </name>
          </person-group>
          <article-title>Mining twitter to explore the emergence of COVID-19 symptoms</article-title>
          <source>Public Health Nurs</source>
          <year>2020</year>
          <month>11</month>
          <day>16</day>
          <volume>37</volume>
          <issue>6</issue>
          <fpage>934</fpage>
          <lpage>940</lpage>
          <pub-id pub-id-type="doi">10.1111/phn.12809</pub-id>
          <pub-id pub-id-type="medline">32937679</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lauer</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Grantz</surname>
              <given-names>KH</given-names>
            </name>
            <name name-style="western">
              <surname>Bi</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>FK</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Meredith</surname>
              <given-names>HR</given-names>
            </name>
            <name name-style="western">
              <surname>Azman</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Reich</surname>
              <given-names>NG</given-names>
            </name>
            <name name-style="western">
              <surname>Lessler</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>The Incubation Period of Coronavirus Disease 2019 (COVID-19) From Publicly Reported Confirmed Cases: Estimation and Application</article-title>
          <source>Annals of Internal Medicine</source>
          <year>2020</year>
          <month>05</month>
          <day>05</day>
          <volume>172</volume>
          <issue>9</issue>
          <fpage>577</fpage>
          <lpage>582</lpage>
          <pub-id pub-id-type="doi">10.7326/m20-0504</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Merchant</surname>
              <given-names>RM</given-names>
            </name>
            <name name-style="western">
              <surname>Lurie</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Social Media and Emergency Preparedness in Response to Novel Coronavirus</article-title>
          <source>JAMA</source>
          <year>2020</year>
          <month>05</month>
          <day>26</day>
          <volume>323</volume>
          <issue>20</issue>
          <fpage>2011</fpage>
          <lpage>2012</lpage>
          <pub-id pub-id-type="doi">10.1001/jama.2020.4469</pub-id>
          <pub-id pub-id-type="medline">32202611</pub-id>
          <pub-id pub-id-type="pii">2763596</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="web">
          <article-title>United States COVID-19 cases and deaths by state</article-title>
          <source>Centers for Disease Control and Prevention</source>
          <access-date>2020-09-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://covid.cdc.gov/covid-data-tracker/#cases_totalcases">https://covid.cdc.gov/covid-data-tracker/#cases_totalcases</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Klein</surname>
              <given-names>AZ</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Weissenbacher</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Levine</surname>
              <given-names>LD</given-names>
            </name>
            <name name-style="western">
              <surname>Gonzalez-Hernandez</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>A natural language processing pipeline to advance the use of Twitter data for digital epidemiology of adverse pregnancy outcomes</article-title>
          <source>Journal of Biomedical Informatics: X</source>
          <year>2020</year>
          <month>12</month>
          <volume>8</volume>
          <fpage>100076</fpage>
          <pub-id pub-id-type="doi">10.1016/j.yjbinx.2020.100076</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Klein</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Magge</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>O'Connor</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Weissenbacher</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Gonzalez-Hernandez</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>A chronological and geographical analysis of personal reports of COVID-19 on Twitter</article-title>
          <source>medRxiv</source>
          <comment>Preprint published online on April 24, 2020</comment>
          <pub-id pub-id-type="doi">10.1101/2020.04.19.20069948</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Maiya</surname>
              <given-names>AS</given-names>
            </name>
          </person-group>
          <article-title>ktrain: a low-code library for augmented machine learning</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on April 19, 2020.
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2004.10703"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title>
          <year>2019</year>
          <conf-name>17th Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT)</conf-name>
          <conf-date>June 2-7, 2019</conf-date>
          <conf-loc>Minneapolis, MN</conf-loc>
          <fpage>4171</fpage>
          <lpage>4186</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/N19-1423</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Müller</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Salathé</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kummervold</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>COVID-Twitter-BERT: a natural language processing model to analyse COVID-19 content on Twitter</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on May 15, 2020.
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2005.07503"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dredze</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Paul</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bergsma</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tran</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Carmen: a Twitter geo-location system with applications to public health</article-title>
          <year>2013</year>
          <conf-name>Association for the Advancement of Artificial Intelligence (AAAI) 2013 Workshop Expanding the Boundaries of Health Informatics Using AI</conf-name>
          <conf-date>July 14-15, 2013</conf-date>
          <conf-loc>Bellevue, WA, USA</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Viera</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Garrett</surname>
              <given-names>JM</given-names>
            </name>
          </person-group>
          <article-title>Understanding interobserver agreement: the kappa statistic</article-title>
          <source>Fam Med</source>
          <year>2005</year>
          <month>05</month>
          <volume>37</volume>
          <issue>5</issue>
          <fpage>360</fpage>
          <lpage>363</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.stfm.org/fmhub/fm2005/May/Anthony360.pdf"/>
          </comment>
          <pub-id pub-id-type="medline">15883903</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
          <article-title>Study data set</article-title>
          <source>Bitbucket</source>
          <access-date>2021-01-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bitbucket.org/pennhlp/twitter_data_download/src/master/">https://bitbucket.org/pennhlp/twitter_data_download/src/master/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
