<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="letter" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v25i1e46484</article-id>
      <article-id pub-id-type="pmid">37399062</article-id>
      <article-id pub-id-type="doi">10.2196/46484</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Research Letter</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Research Letter</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Automatically Identifying Self-Reports of COVID-19 Diagnosis on Twitter: An Annotated Data Set, Deep Neural Network Classifiers, and a Large-Scale Cohort</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Khademi</surname>
            <given-names>Sedigh</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Omranian</surname>
            <given-names>Sammie</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Klein</surname>
            <given-names>Ari Z</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8281-3464</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Kunatharaju</surname>
            <given-names>Shriya</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6042-1745</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>O'Connor</surname>
            <given-names>Karen</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7709-3813</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Gonzalez-Hernandez</surname>
            <given-names>Graciela</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <address>
            <institution>Department of Computational Biomedicine</institution>
            <institution>Cedars-Sinai Medical Center</institution>
            <addr-line>Pacific Design Center, Ste G549F</addr-line>
            <addr-line>700 N San Vicente Blvd</addr-line>
            <addr-line>West Hollywood, CA, 90069</addr-line>
            <country>United States</country>
            <phone>1 310 423 3521</phone>
            <email>Graciela.GonzalezHernandez@csmc.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6416-9556</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Biostatistics, Epidemiology, and Informatics</institution>
        <institution>Perelman School of Medicine</institution>
        <institution>University of Pennsylvania</institution>
        <addr-line>Philadelphia, PA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Autism Spectrum Program of Excellence</institution>
        <institution>Perelman School of Medicine</institution>
        <institution>University of Pennsylvania</institution>
        <addr-line>Philadelphia, PA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Computational Biomedicine</institution>
        <institution>Cedars-Sinai Medical Center</institution>
        <addr-line>West Hollywood, CA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Graciela Gonzalez-Hernandez <email>Graciela.GonzalezHernandez@csmc.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2023</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>3</day>
        <month>7</month>
        <year>2023</year>
      </pub-date>
      <volume>25</volume>
      <elocation-id>e46484</elocation-id>
      <history>
        <date date-type="received">
          <day>13</day>
          <month>2</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>12</day>
          <month>4</month>
          <year>2023</year>
        </date>
        <date date-type="rev-recd">
          <day>3</day>
          <month>5</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>25</day>
          <month>5</month>
          <year>2023</year>
        </date>
      </history>
      <copyright-statement>©Ari Z Klein, Shriya Kunatharaju, Karen O'Connor, Graciela Gonzalez-Hernandez. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 03.07.2023.</copyright-statement>
      <copyright-year>2023</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2023/1/e46484" xlink:type="simple"/>
      <kwd-group>
        <kwd>natural language processing</kwd>
        <kwd>data mining</kwd>
        <kwd>social media</kwd>
        <kwd>COVID-19</kwd>
        <kwd>Twitter</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Studies have shown that Twitter can be a complementary source of data for monitoring personal experiences of COVID-19, such as symptoms [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. Given the lack of manually annotated training data for supervised machine learning, however, these studies relied on other methods to identify English-language tweets that self-report a COVID-19 infection, including keywords [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>], regular expressions [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>], transfer learning [<xref ref-type="bibr" rid="ref6">6</xref>], self-supervised learning [<xref ref-type="bibr" rid="ref7">7</xref>], and unsupervised learning [<xref ref-type="bibr" rid="ref8">8</xref>]. As Mackey et al [<xref ref-type="bibr" rid="ref8">8</xref>] suggest, “supervised models that can leverage validated training sets are likely to have a much higher performance… and could likely achieve classification closer to real time.” The objective of this study was to develop and deploy a manually annotated data set and benchmark classification models for automatically identifying users who have self-reported a COVID-19 diagnosis. To validate self-reports of COVID-19 infection, we included only tweets that provide evidence of a diagnosis, such as a positive test, clinical diagnosis, or hospitalization.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Ethical Considerations</title>
        <p>The institutional review boards of the University of Pennsylvania and Cedars-Sinai Medical Center reviewed this study and deemed this human subjects research exempt.</p>
      </sec>
      <sec>
        <title>Data Collection</title>
        <p>Between July 2020 and May 2021, we collected approximately 600,000 English-language tweets, excluding retweets, from the Twitter streaming application programming interface (API) that included keywords related to both COVID-19 and a test, diagnosis, or hospitalization as a tokenized match (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). For tweets that mentioned a test, we also required them to include the keyword <italic>positive</italic>. We then searched these tweets for personal references to the user and automatically excluded tweets with select references to other people who were assumed not to be members of the user’s household. The full query (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>) returned 70,319 tweets that were posted by 58,847 users.</p>
      </sec>
      <sec>
        <title>Annotation</title>
        <p>We randomly sampled 10,000 (14%) of the 70,319 tweets, posted by unique users, and developed annotation guidelines (<xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>) to help 3 annotators distinguish tweets that self-reported a COVID-19 diagnosis from those that did not. Among the 10,000 tweets, 9000 (90%) were annotated by 2 annotators and 1000 (10%) were annotated by all 3 annotators. Interannotator agreement (Fleiss κ), based on these 1000 tweets, was 0.79. After resolving the disagreements among all 10,000 tweets, 1728 (17%) were annotated as self-reporting a COVID-19 diagnosis and 8272 (83%) as not.</p>
      </sec>
      <sec>
        <title>Automatic Classification</title>
        <p>We split the 10,000 tweets into 80% and 20% random sets as training data (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>) and held-out test data, respectively, and performed machine learning experiments using 5 deep neural network classifiers based on bidirectional encoder representations from transformers (BERT) [<xref ref-type="bibr" rid="ref9">9</xref>]. We preprocessed the tweets by normalizing URLs and usernames and lowercasing the text. For training, we used Adam optimization, a batch size of 8, 5 epochs, and a learning rate of 0.00001, based on evaluating models after each epoch using a 5% split of the training set. We fine-tuned all layers of the transformer models with our annotated tweets.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p><xref ref-type="table" rid="table1">Table 1</xref> presents the performance of the classifiers. The COVID-Twitter-BERT classifier, based on a BERT model that was pretrained on tweets related to COVID-19 [<xref ref-type="bibr" rid="ref10">10</xref>], achieved the highest <italic>F</italic><sub>1</sub>-score: 0.94 (precision=0.96, recall=0.91). We deployed the classifier on 948,859 unlabeled tweets retrieved by our query (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>) through January 2023, and 222,084 of them were detected as self-reports of a COVID-19 diagnosis, posted by 181,521 users (<xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>). To validate precision over time, we annotated 1500 automatically classified tweets that were posted up to 15 months after our initial data collection, identifying 1451 true positives (precision=0.97).</p>
      <p><xref ref-type="table" rid="table2">Table 2</xref> presents examples of false positives and false negatives of the COVID-Twitter-BERT classifier in the test set. Among the 12 false positives, 4 (33%) were reported speech, such as quotations (tweet 1), and 2 (17%) reported a positive antibody test (tweet 2), which were annotated as “positive” when the tweet did not imply that the test result may have been associated with vaccination. Among the 29 false negatives, 11 (38%) reported being hospitalized (tweet 3), 3 (10%) mentioned a negative COVID-19 test (tweet 4), and another 3 (10%) reported receiving treatment for COVID-19 (tweet 5).</p>
      <table-wrap position="float" id="table1">
        <label>Table 1</label>
        <caption>
          <p>Precision, recall, and <italic>F</italic><sub>1</sub>-scores of deep neural network classifiers for the class of tweets that self-report a COVID-19 diagnosis, evaluated on a held-out test set of 2000 manually annotated tweets.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="330"/>
          <col width="220"/>
          <col width="220"/>
          <col width="230"/>
          <thead>
            <tr valign="top">
              <td>Classifier</td>
              <td>Precision</td>
              <td>Recall</td>
              <td><italic>F</italic><sub>1</sub>-score</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>BERT-Base-Uncased</td>
              <td>0.82</td>
              <td>0.85</td>
              <td>0.84</td>
            </tr>
            <tr valign="top">
              <td>DistilBERT-Base-Uncased</td>
              <td>0.83</td>
              <td>0.77</td>
              <td>0.80</td>
            </tr>
            <tr valign="top">
              <td>RoBERTa-Large</td>
              <td>0.87</td>
              <td>0.92</td>
              <td>0.90</td>
            </tr>
            <tr valign="top">
              <td>BERTweet-Large</td>
              <td>0.90</td>
              <td>0.91</td>
              <td>0.91</td>
            </tr>
            <tr valign="top">
              <td>COVID-Twitter-BERT</td>
              <td>0.96</td>
              <td>0.91</td>
              <td>0.94</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
      <table-wrap position="float" id="table2">
        <label>Table 2</label>
        <caption>
          <p>Sample false-positive and false-negative tweets of the COVID-Twitter-BERT classifier (with the keywords that matched the data collection query in italics).</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="80"/>
          <col width="720"/>
          <col width="80"/>
          <col width="120"/>
          <thead>
            <tr valign="top">
              <td>Number</td>
              <td>Tweet</td>
              <td>Actual</td>
              <td>Predicted</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>1</td>
              <td>“<italic>I</italic> am always advocating for people to get the vaccine,” says @QCC_CUNY Public Safety Specialist Doodnauth Singh. “It is safe and has been <italic>tested</italic> a lot. <italic>I</italic> am in excellent health, but <italic>tested positive</italic> for <italic>COVID</italic> in December. Stay safe, not sorry.”</td>
              <td>–</td>
              <td>+</td>
            </tr>
            <tr valign="top">
              <td>2</td>
              <td><italic>I</italic> just received the results of <italic>my COVID</italic> Antibody <italic>test</italic>. After 6 months from <italic>my</italic> 2nd shot, <italic>I</italic> am happy to report that <italic>I tested POSITIVE</italic>!!!!</td>
              <td>–</td>
              <td>+</td>
            </tr>
            <tr valign="top">
              <td>3</td>
              <td>After another night <italic>in the hospital I</italic>’ve decided <italic>I</italic> won’t let <italic>Covid</italic> take <italic>me</italic> out! <italic>I</italic>’m Hanging on!</td>
              <td>+</td>
              <td>–</td>
            </tr>
            <tr valign="top">
              <td>4</td>
              <td><italic>Me</italic> and <italic>my</italic> bf literally sleep in the same bed everyday his <italic>covid test</italic> was negative mines was <italic>positive</italic> this is crazy <inline-graphic xlink:href="jmir_v25i1e46484_fig1.png" xlink:type="simple" mimetype="image"/></td>
              <td>+</td>
              <td>–</td>
            </tr>
            <tr valign="top">
              <td>5</td>
              <td><italic>I</italic>’ve had and recovered from <italic>covid</italic> getting monoclonal antibodies. <italic>I</italic> got the J &amp; J vaccine. <italic>I</italic> read that <italic>I</italic> have a 90% chance of not contracting <italic>covid</italic> again and a 100% chance of not being <italic>hospitalized</italic>. Are these numbers true?</td>
              <td>+</td>
              <td>–</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <p>The benchmark performance of supervised classification demonstrates the utility of our annotated training data (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>) for automatically identifying Twitter users who have self-reported a COVID-19 infection, facilitating the use of Twitter data for monitoring personal experiences of COVID-19 in real time. Although our approach is limited to users who report evidence of a diagnosis, our deployment demonstrates that users can be identified on a large scale (<xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>).</p>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Twitter API keywords for tokenized tweet matching.</p>
        <media xlink:href="jmir_v25i1e46484_app1.txt" xlink:title="TXT File, 1 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Data collection query.</p>
        <media xlink:href="jmir_v25i1e46484_app2.txt" xlink:title="TXT File, 2 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Annotation guidelines.</p>
        <media xlink:href="jmir_v25i1e46484_app3.docx" xlink:title="DOCX File, 115 KB"/>
      </supplementary-material>
      <supplementary-material id="app4">
        <label>Multimedia Appendix 4</label>
        <p>Training data.</p>
        <media xlink:href="jmir_v25i1e46484_app4.txt" xlink:title="TXT File, 180 KB"/>
      </supplementary-material>
      <supplementary-material id="app5">
        <label>Multimedia Appendix 5</label>
        <p>Large-scale cohort.</p>
        <media xlink:href="jmir_v25i1e46484_app5.txt" xlink:title="TXT File, 4554 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">API</term>
          <def>
            <p>application programming interface</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">BERT</term>
          <def>
            <p>bidirectional encoder representations from transformers</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was supported by the National Library of Medicine (R01LM011176). The authors thank Ivan Flores for contributing to software applications and Alexis Upshur for contributing to annotating the Twitter data.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The manually annotated training data and unlabeled data resulting from the automatic classification are included with this published article in its supplementary information files, as <xref ref-type="supplementary-material" rid="app4">Multimedia Appendices 4</xref> and <xref ref-type="supplementary-material" rid="app5">5</xref>, respectively. In accordance with the Twitter Terms of Service, these tweets are made available as tweet IDs, which can be rehydrated as tweet objects if they remain public at the time they are requested through the Twitter API.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>AZK contributed to the data collection, machine learning experiments, error analysis, and writing the paper. SK contributed to the annotation, machine learning experiments, and writing the paper. KO contributed to the annotation guidelines, annotation, and editing the paper. GGH contributed to the study design and editing the paper.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Krittanawong</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Narasimhan</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Virk</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Narasimhan</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Insights from Twitter about novel COVID-19 symptoms</article-title>
          <source>Eur Heart J Digit Health</source>
          <year>2020</year>
          <month>11</month>
          <volume>1</volume>
          <issue>1</issue>
          <fpage>4</fpage>
          <lpage>5</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/34192272"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/ehjdh/ztaa003</pub-id>
          <pub-id pub-id-type="medline">34192272</pub-id>
          <pub-id pub-id-type="pii">ztaa003</pub-id>
          <pub-id pub-id-type="pmcid">PMC7799127</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Banda</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Adderley</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Ahmed</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>AlGhoul</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Alser</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Alser</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Areia</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Cogenur</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Fister</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gombar</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Huser</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Jonnagaddala</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Leis</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mateu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Mayer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Minty</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Morales</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Natarajan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Paredes</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Periyakoil</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Prats-Uribe</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ross</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Subbian</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Vivekanantham</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Prieto-Alhambra</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Characterization of long-term patient-reported symptoms of COVID-19: an analysis of social media data</article-title>
          <source>medRxiv</source>
          <comment>Preprint posted online July 15, 2021. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.medrxiv.org/content/10.1101/2021.07.13.21260449v1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1101/2021.07.13.21260449</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Matharaarachchi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Domaratzki</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Katz</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Muthukumarana</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Discovering long COVID symptom patterns: association rule mining and sentiment analysis in social media tweets</article-title>
          <source>JMIR Form Res</source>
          <year>2022</year>
          <month>09</month>
          <day>07</day>
          <volume>6</volume>
          <issue>9</issue>
          <fpage>e37984</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://formative.jmir.org/2022/9/e37984/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/37984</pub-id>
          <pub-id pub-id-type="medline">36069846</pub-id>
          <pub-id pub-id-type="pii">v6i9e37984</pub-id>
          <pub-id pub-id-type="pmcid">PMC9494218</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sarker</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lakamana</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hogg-Bremer</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Al-Garadi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Self-reported COVID-19 symptoms on Twitter: an analysis and a research resource</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2020</year>
          <month>08</month>
          <day>01</day>
          <volume>27</volume>
          <issue>8</issue>
          <fpage>1310</fpage>
          <lpage>1315</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/32620975"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocaa116</pub-id>
          <pub-id pub-id-type="medline">32620975</pub-id>
          <pub-id pub-id-type="pii">5867237</pub-id>
          <pub-id pub-id-type="pmcid">PMC7337747</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sisler</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wallace</surname>
              <given-names>AS</given-names>
            </name>
          </person-group>
          <article-title>Exploring experiences of COVID-19-positive individuals from social media posts</article-title>
          <source>Int J Nurs Pract</source>
          <year>2021</year>
          <month>10</month>
          <day>14</day>
          <volume>27</volume>
          <issue>5</issue>
          <fpage>e12986</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/34128296"/>
          </comment>
          <pub-id pub-id-type="doi">10.1111/ijn.12986</pub-id>
          <pub-id pub-id-type="medline">34128296</pub-id>
          <pub-id pub-id-type="pmcid">PMC8420411</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bernard</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Séroussi</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Weber</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Dhombres</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Grouin</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Liebe</surname>
              <given-names>J-D</given-names>
            </name>
            <name name-style="western">
              <surname>Pelayo</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Pinna</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Rance</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Sacchi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Ugon</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Benis</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gallos</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Discovery of COVID-19 symptomatic experience reported by Twitter users</article-title>
          <source>Challenges of Trustable AI and Added-Value on Health (volume 294) &#124; Studies in Health Technology and Informatics</source>
          <year>2022</year>
          <publisher-loc>Amsterdam, Netherlands</publisher-loc>
          <publisher-name>IOS Press</publisher-name>
          <fpage>664</fpage>
          <lpage>668</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lwowski</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Rad</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>COVID-19 surveillance through Twitter using self-supervised and few shot learning</article-title>
          <year>2020</year>
          <conf-name>Proceedings of the 1st Workshop on NLP for COVID-19 (Part 2) at EMNLP 2020</conf-name>
          <conf-date>Nov 2020</conf-date>
          <conf-loc>Online</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2020.nlpcovid19-2.9.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2020.nlpcovid19-2.9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mackey</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Purushothaman</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Nali</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bardier</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cuomo</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Machine learning to detect self-reporting of symptoms, testing access, and recovery associated with COVID-19 on Twitter: retrospective big data infoveillance study</article-title>
          <source>JMIR Public Health Surveill</source>
          <year>2020</year>
          <month>06</month>
          <day>08</day>
          <volume>6</volume>
          <issue>2</issue>
          <fpage>e19509</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://publichealth.jmir.org/2020/2/e19509/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/19509</pub-id>
          <pub-id pub-id-type="medline">32490846</pub-id>
          <pub-id pub-id-type="pii">v6i2e19509</pub-id>
          <pub-id pub-id-type="pmcid">PMC7282475</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title>
          <year>2019</year>
          <conf-name>Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name>
          <conf-date>June 2-7, 2019</conf-date>
          <conf-loc>Minneapolis, MN</conf-loc>
          <fpage>4171</fpage>
          <lpage>4186</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/N19-1423.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Müller</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Salathé</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kummervold</surname>
              <given-names>PE</given-names>
            </name>
          </person-group>
          <article-title>COVID-Twitter-BERT: a natural language processing model to analyse COVID-19 content on Twitter</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online May 15, 2020. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2005.07503"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2005.07503</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
