<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<?covid-19-tdm?>
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v25i1e45767</article-id>
      <article-id pub-id-type="pmid">37725432</article-id>
      <article-id pub-id-type="doi">10.2196/45767</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Using Social Media to Help Understand Patient-Reported Health Outcomes of Post–COVID-19 Condition: Natural Language Processing Approach</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Machinathu Parambil Gangadharan</surname>
            <given-names>Syam</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Xie</surname>
            <given-names>Xiaoxu</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Dolatabadi</surname>
            <given-names>Elham</given-names>
          </name>
          <degrees>BSc, MSc, PhD</degrees>
          <xref rid="aff01" ref-type="aff">1</xref>
          <address>
            <institution>Faculty of Health, School of Health Policy and Management</institution>
            <institution>York University</institution>
            <addr-line>4700 Keele Street</addr-line>
            <addr-line>North York</addr-line>
            <addr-line>Toronto, ON, M3J 1P3</addr-line>
            <country>Canada</country>
            <phone>1 6477069756</phone>
            <email>edolatab@yorku.ca</email>
          </address>
          <xref rid="aff02" ref-type="aff">2</xref>
          <xref rid="aff03" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2236-2611</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Moyano</surname>
            <given-names>Diana</given-names>
          </name>
          <degrees>BBA, MMAI</degrees>
          <xref rid="aff02" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0007-9865-2232</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Bales</surname>
            <given-names>Michael</given-names>
          </name>
          <degrees>BASc, BSc, MBA</degrees>
          <xref rid="aff04" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0008-5604-2257</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Spasojevic</surname>
            <given-names>Sofija</given-names>
          </name>
          <degrees>BSc, MSc, PhD</degrees>
          <xref rid="aff04" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8111-4197</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Bhambhoria</surname>
            <given-names>Rohan</given-names>
          </name>
          <degrees>BTech, MEng, PhD</degrees>
          <xref rid="aff05" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2597-670X</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Bhatti</surname>
            <given-names>Junaid</given-names>
          </name>
          <degrees>MSc, PhD</degrees>
          <xref rid="aff06" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9540-845X</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Debnath</surname>
            <given-names>Shyamolima</given-names>
          </name>
          <degrees>BSc, MSc</degrees>
          <xref rid="aff07" ref-type="aff">7</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0009-5062-5651</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Hoell</surname>
            <given-names>Nicholas</given-names>
          </name>
          <degrees>BA, PhD</degrees>
          <xref rid="aff07" ref-type="aff">7</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0008-1151-4716</ext-link>
        </contrib>
        <contrib id="contrib9" contrib-type="author">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Xin</given-names>
          </name>
          <degrees>BASc</degrees>
          <xref rid="aff03" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0008-1831-7459</ext-link>
        </contrib>
        <contrib id="contrib10" contrib-type="author">
          <name name-style="western">
            <surname>Leng</surname>
            <given-names>Celine</given-names>
          </name>
          <degrees>BBA</degrees>
          <xref rid="aff04" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9153-2958</ext-link>
        </contrib>
        <contrib id="contrib11" contrib-type="author">
          <name name-style="western">
            <surname>Nanda</surname>
            <given-names>Sasha</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff07" ref-type="aff">7</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0005-2182-1486</ext-link>
        </contrib>
        <contrib id="contrib12" contrib-type="author">
          <name name-style="western">
            <surname>Saab</surname>
            <given-names>Jad</given-names>
          </name>
          <degrees>BACS, MSc</degrees>
          <xref rid="aff08" ref-type="aff">8</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0007-3667-7645</ext-link>
        </contrib>
        <contrib id="contrib13" contrib-type="author">
          <name name-style="western">
            <surname>Sahak</surname>
            <given-names>Esmat</given-names>
          </name>
          <degrees>BASc</degrees>
          <xref rid="aff03" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0008-3952-7068</ext-link>
        </contrib>
        <contrib id="contrib14" contrib-type="author">
          <name name-style="western">
            <surname>Sie</surname>
            <given-names>Fanny</given-names>
          </name>
          <degrees>BSc, MMI</degrees>
          <xref rid="aff04" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0008-4944-2447</ext-link>
        </contrib>
        <contrib id="contrib15" contrib-type="author">
          <name name-style="western">
            <surname>Uppal</surname>
            <given-names>Sara</given-names>
          </name>
          <degrees>BEng, MSc</degrees>
          <xref rid="aff08" ref-type="aff">8</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0006-9008-005X</ext-link>
        </contrib>
        <contrib id="contrib16" contrib-type="author">
          <name name-style="western">
            <surname>Vadlamudi</surname>
            <given-names>Nirma Khatri</given-names>
          </name>
          <degrees>MPH, PhD</degrees>
          <xref rid="aff09" ref-type="aff">9</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5513-7926</ext-link>
        </contrib>
        <contrib id="contrib17" contrib-type="author">
          <name name-style="western">
            <surname>Vladimirova</surname>
            <given-names>Antoaneta</given-names>
          </name>
          <degrees>MSc, PhD</degrees>
          <xref rid="aff10" ref-type="aff">10</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0000-5123-5158</ext-link>
        </contrib>
        <contrib id="contrib18" contrib-type="author">
          <name name-style="western">
            <surname>Yakimovich</surname>
            <given-names>Artur</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff11" ref-type="aff">11</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2458-4904</ext-link>
        </contrib>
        <contrib id="contrib19" contrib-type="author">
          <name name-style="western">
            <surname>Yang</surname>
            <given-names>Xiaoxue</given-names>
          </name>
          <degrees>HBSc, MSc</degrees>
          <xref rid="aff07" ref-type="aff">7</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0004-9059-9017</ext-link>
        </contrib>
        <contrib id="contrib20" contrib-type="author">
          <name name-style="western">
            <surname>Kocak</surname>
            <given-names>Sedef Akinli</given-names>
          </name>
          <degrees>BS, MSc, PhD</degrees>
          <xref rid="aff02" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0009-6271-415X</ext-link>
        </contrib>
        <contrib id="contrib21" contrib-type="author">
          <name name-style="western">
            <surname>Cheung</surname>
            <given-names>Angela M</given-names>
          </name>
          <degrees>BA, MD, PhD</degrees>
          <xref rid="aff03" ref-type="aff">3</xref>
          <xref rid="aff12" ref-type="aff">12</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8332-0744</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff01">
        <label>1</label>
        <institution>Faculty of Health, School of Health Policy and Management</institution>
        <institution>York University</institution>
        <addr-line>Toronto, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff02">
        <label>2</label>
        <institution>Vector Institute</institution>
        <addr-line>Toronto, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff03">
        <label>3</label>
        <institution>Department of Medicine and Joint Department of Medical Imaging, University of Toronto</institution>
        <addr-line>Toronto, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff04">
        <label>4</label>
        <institution>Hoffmann-La Roche Ltd</institution>
        <addr-line>Toronto, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff05">
        <label>5</label>
        <institution>Electrical and Computer Engineering, Queen’s University</institution>
        <addr-line>Kingston, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff06">
        <label>6</label>
        <institution>Manulife</institution>
        <addr-line>Toronto, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff07">
        <label>7</label>
        <institution>Deloitte</institution>
        <addr-line>Toronto, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff08">
        <label>8</label>
        <institution>TELUS Health</institution>
        <addr-line>Montreal, QC</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff09">
        <label>9</label>
        <institution>Department of Pediatrics, Faculty of Medicine, University of British Columbia</institution>
        <addr-line>Vancouver, BC</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff10">
        <label>10</label>
        <institution>Roche Information Solutions</institution>
        <addr-line>San Francisco, CA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff11">
        <label>11</label>
        <institution>Hoffmann-La Roche Ltd</institution>
        <addr-line>Munich</addr-line>
        <country>Germany</country>
      </aff>
      <aff id="aff12">
        <label>12</label>
        <institution>University Health Network</institution>
        <addr-line>Toronto, ON</addr-line>
        <country>Canada</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Elham Dolatabadi <email>edolatab@yorku.ca</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2023</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>19</day>
        <month>9</month>
        <year>2023</year>
      </pub-date>
      <volume>25</volume>
      <elocation-id>e45767</elocation-id>
      <history>
        <date date-type="received">
          <day>16</day>
          <month>1</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>23</day>
          <month>2</month>
          <year>2023</year>
        </date>
        <date date-type="rev-recd">
          <day>18</day>
          <month>5</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>5</day>
          <month>6</month>
          <year>2023</year>
        </date>
      </history>
      <copyright-statement>©Elham Dolatabadi, Diana Moyano, Michael Bales, Sofija Spasojevic, Rohan Bhambhoria, Junaid Bhatti, Shyamolima Debnath, Nicholas Hoell, Xin Li, Celine Leng, Sasha Nanda, Jad Saab, Esmat Sahak, Fanny Sie, Sara Uppal, Nirma Khatri Vadlamudi, Antoaneta Vladimirova, Artur Yakimovich, Xiaoxue Yang, Sedef Akinli Kocak, Angela M Cheung. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 19.09.2023.</copyright-statement>
      <copyright-year>2023</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2023/1/e45767" xlink:type="simple"/>
      <related-article related-article-type="correction-forward" xlink:title="This is a corrected version. See correction statement in:" xlink:href="https://www.jmir.org/2023/1/e55010" vol="25" page="e55010"> </related-article>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>While scientific knowledge of post–COVID-19 condition (PCC) is growing, there remains significant uncertainty in the definition of the disease, its expected clinical course, and its impact on daily functioning. Social media platforms can generate valuable insights into patient-reported health outcomes as the content is produced at high resolution by patients and caregivers, representing experiences that may be unavailable to most clinicians.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>In this study, we aimed to determine the validity and effectiveness of advanced natural language processing approaches built to derive insight into PCC-related patient-reported health outcomes from social media platforms Twitter and Reddit. We extracted PCC-related terms, including symptoms and conditions, and measured their occurrence frequency. We compared the outputs with human annotations and clinical outcomes and tracked symptom and condition term occurrences over time and locations to explore the pipeline’s potential as a surveillance tool.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We used bidirectional encoder representations from transformers (BERT) models to extract and normalize PCC symptom and condition terms from English posts on Twitter and Reddit. We compared 2 named entity recognition models and implemented a 2-step normalization task to map extracted terms to unique concepts in standardized terminology. The normalization steps were done using a semantic search approach with BERT biencoders. We evaluated the effectiveness of BERT models in extracting the terms using a human-annotated corpus and a proximity-based score. We also compared the validity and reliability of the extracted and normalized terms to a web-based survey with more than 3000 participants from several countries.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>UmlsBERT-Clinical had the highest accuracy in predicting entities closest to those extracted by human annotators. Based on our findings, the top 3 most commonly occurring groups of PCC symptom and condition terms were systemic (such as <italic>fatigue</italic>), neuropsychiatric (such as <italic>anxiety</italic> and <italic>brain fog</italic>), and respiratory (such as <italic>shortness of breath</italic>). In addition, we also found novel symptom and condition terms that had not been categorized in previous studies, such as <italic>infection</italic> and <italic>pain</italic>. Regarding the co-occurring symptoms, the pair of <italic>fatigue</italic> and <italic>headaches</italic> was among the most co-occurring term pairs across both platforms. Based on the temporal analysis, the neuropsychiatric terms were the most prevalent, followed by the systemic category, on both social media platforms. Our spatial analysis concluded that 42% (10,938/26,247) of the analyzed terms included location information, with the majority coming from the United States, United Kingdom, and Canada.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>The outcome of our social media–derived pipeline is comparable with the results of peer-reviewed articles relevant to PCC symptoms. Overall, this study provides unique insights into patient-reported health outcomes of PCC and valuable information about the patient’s journey that can help health care providers anticipate future needs.</p>
        </sec>
        <sec sec-type="registered-report">
          <title>International Registered Report Identifier (IRRID)</title>
          <p>RR2-10.1101/2022.12.14.22283419</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>long COVID</kwd>
        <kwd>post–COVID-19 condition</kwd>
        <kwd>PCC</kwd>
        <kwd>social media</kwd>
        <kwd>natural language processing</kwd>
        <kwd>transformer models</kwd>
        <kwd>bidirectional encoder representations from transformers</kwd>
        <kwd>machine learning</kwd>
        <kwd>Twitter</kwd>
        <kwd>Reddit</kwd>
        <kwd>PRO</kwd>
        <kwd>patient-reported outcome</kwd>
        <kwd>patient-reported symptom</kwd>
        <kwd>health outcome</kwd>
        <kwd>symptom</kwd>
        <kwd>entity extraction</kwd>
        <kwd>entity normalization</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Postacute sequelae of SARS-CoV-2, known as post–COVID-19 condition (PCC) or colloquially, long COVID, are broadly defined as delayed recovery from infection with SARS-CoV-2. PCC can occur following severe, mild, or even asymptomatic SARS-CoV-2 infection [<xref ref-type="bibr" rid="ref1">1</xref>]. Patients with PCC experience lingering or episodic symptoms for greater than 12 weeks or 3 months after acute infection [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Despite a growing interest in characterizing clinical manifestations of PCC, no standard framework has yet been established [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. Symptoms of PCC are extremely heterogeneous and their assessment varies widely among studies.</p>
      <p>Multiple pioneering efforts have investigated symptoms of PCC for hospitalized individuals, representing the minority of people with COVID-19 [<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref7">7</xref>]. An extensive patient-led survey has been conducted in an outpatient setting to explore the symptoms of PCC over 7 months [<xref ref-type="bibr" rid="ref3">3</xref>]. The survey analyzed numerous symptoms of PCC from 3762 confirmed (or suspected) patients with COVID-19 and attributed them to 10 organ systems. To our knowledge, this study is one of the most highly cited papers providing insights into PCC for researchers and clinicians. However, as the authors note, a limitation of the study is the existence of a sampling bias toward patients with PCC who opted to participate in the study. The authors recommend greater outreach with diverse groups of patients is needed to counter sample bias and better characterize the PCC phenomenon. This shortcoming motivates the proposed social media approach to improve understanding of PCC by filling the information gaps in a more diverse patient population.</p>
      <p>The rise of social media platforms has provided researchers and public agencies an unprecedented opportunity to gain insight into personal and population health experiences outside traditional health care settings [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref11">11</xref>]. Global content on social media platforms is consistently expanding with users expected to increase to 4.4 billion individuals by 2025 [<xref ref-type="bibr" rid="ref12">12</xref>]. With an appropriate data analytics approach, these data have proven useful in generating insights into emerging health conditions, as seen with Ebola virus [<xref ref-type="bibr" rid="ref13">13</xref>], Zika virus [<xref ref-type="bibr" rid="ref14">14</xref>], and foodborne disease [<xref ref-type="bibr" rid="ref15">15</xref>]. Numerous studies have used Twitter and Reddit posts as valuable resources for studying public health measures, the evolution of new medical conditions [<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref18">18</xref>], and exploring populations’ health during and after COVID-19 [<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref23">23</xref>].</p>
      <p>A close examination of prior social media studies demonstrates that generating valuable insights (ie, clinical symptoms) from social media platforms requires complex and well-designed natural language processing (NLP) approaches [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. NLP, so far, has had a significant impact on COVID-19 research and response efforts; NLP techniques have been used to predict the initial spread of COVID-19 quicker than public health [<xref ref-type="bibr" rid="ref26">26</xref>], extract COVID-19 content from social media posts [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref27">27</xref>], and detect COVID-19 misinformation videos on YouTube [<xref ref-type="bibr" rid="ref28">28</xref>]. In a recent study, NLP was implemented to characterize a broad set of COVID-19 signs and symptoms from medical records, with enhanced detail and timeliness [<xref ref-type="bibr" rid="ref29">29</xref>]. Altogether, these findings provide clear evidence that NLP can strengthen PCC surveillance and help researchers and public health officials understand the public perceptions and attitudes toward the long-term impact of COVID-19 infection on physical and psychological health. Despite all these promising points, adopting and integrating these types of patient-generated data into broader health research and services requires a comprehensive evaluation of the data and the NLP model. This evaluation task is laborious due to the lack of a “ground truth” from social media data, requiring the pooling of resources from related literature and engaging subject matter experts [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>].</p>
      <p>In response to the emergence of PCC, we developed an NLP pipeline as shown in <xref rid="figure1" ref-type="fig">Figure 1</xref> to facilitate extracting information from user-reported experiences in social media platforms [<xref ref-type="bibr" rid="ref32">32</xref>]. In this study, we examined the validity and effectiveness of our NLP pipeline to provide insights into patient-reported PCC-related health outcomes across 2 popular social media platforms, Twitter and Reddit. In doing so, we extracted symptoms and conditions and estimated their occurrence frequency. We compared the outputs with human annotations and highly used clinical outcomes grounded in the medical literature. Lastly, we tracked occurrences of symptom and condition terms over time and geographies to explore the pipeline’s potential to be used as a surveillance tool reflecting users’ opinions and experiences.</p>
      <fig id="figure1" position="float">
        <label>Figure 1</label>
        <caption>
          <p>Illustration of implementation of an end-to-end natural language processing pipeline for extracting information from user-reported experiences in the social media platforms Twitter and Reddit. The data preprocessing step in the pipeline includes self-report extraction and location information inference. Next in the pipeline is the extraction and 2-step normalization of post–COVID-19 condition terms. UmlsBERT-Clinical is used for term extraction tasks. The first step of normalization involves mapping terms to their common base forms. The second step of normalization involves mapping from base forms to unique concepts derived from the post–COVID-19 condition survey. API: application programming interface; MNLI: multi-genre natural language inference; RegEx: regular expression approach.</p>
        </caption>
        <graphic xlink:href="jmir_v25i1e45767_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Collection</title>
        <p>We used application programming interfaces (APIs) to collect data from the 2 social media platforms for the period from 2019 until 2021. We hashed all usernames and removed URLs through the deidentification process. To further pseudonymize the data, we transformed special characters in the tweets or Reddit posts to lowercase and extracted contractions. <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> and our previous work describe more details about the data collection and preprocessing steps [<xref ref-type="bibr" rid="ref32">32</xref>].</p>
        <p>For Twitter searches, we chose relevant hashtags (eg, #longcovid, #covidlong, or #longhauler) and words included in tweets (eg, <italic>long-hauler</italic>, <italic>chronic symptoms</italic>, and <italic>long-term effects</italic>). We excluded retweets, replies, quotes, and nullcast from the data set, as well as any tweets that were not in English. The list of hashtags is shown in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Our Twitter data set’s average character length and word count were 133 and 23, respectively. For the Reddit data, we targeted specific subreddits such as r/covidlonghaulers that were appropriate to the PCC topic. In contrast to the short length of tweets (up to 280 characters), Reddit posts can have up to 40,000 characters, and the longest Reddit post in our data set had 17,060 characters.</p>
      </sec>
      <sec>
        <title>Data Preprocessing</title>
        <sec>
          <title>Self-Report Extraction</title>
          <p>For this study we were primarily interested in posts containing self-reported medical symptoms. We therefore excluded individual posts in our corpus with other purposes at the time of posting, such as disseminating news and sharing ideas and opinions. We used a combination of the regular expression approach (RegEx) and transformer-based bidirectional encoder representations from transformers (BERT) classifier to distinguish between posts of self-reports (explaining personal health status) and other posts, including opinions or news reports. The RegEx approach relies on personal pronouns, as shown in <xref ref-type="table" rid="table1">Table 1</xref>. The classifier, which we call MNLI classifier, is a natural language inference model—the COVID-Twitter BERT (version 2), fine-tuned on the MultiNLI data set [<xref ref-type="bibr" rid="ref24">24</xref>]. The classifier annotates posts as either self-report (annotated as “1”) or not (annotated as “0”) and poses the candidate labels (eg, <italic>my experience</italic>) as either “premise” or “hypothesis.” We ran the RegEx model and the combination of RegEx and MNLI classifier (RegEx+MNLI classifier) on the entire Twitter data set. We saved the scores, with the averaged performances shown in <xref ref-type="table" rid="table2">Table 2</xref>. The standalone RegEx filter outperformed the RegEx+MNLI classifier without substantial loss in precision, as reflected by the <italic>F</italic><sub>1</sub>-score (<xref ref-type="table" rid="table2">Table 2</xref>). Although the proportion of self-reports is much higher on Reddit, the same self-filtration approach was applied to Reddit posts for consistency.</p>
          <table-wrap position="float" id="table1">
            <label>Table 1</label>
            <caption>
              <p>Self-report extraction. Pronouns and respective regular expression (RegEx) were used for extraction of self-reports from social media posts.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="290"/>
              <col width="710"/>
              <thead>
                <tr valign="top">
                  <td>Pronoun captured</td>
                  <td>Regular expression code</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>I</td>
                  <td>
                    <inline-graphic xlink:href="jmir_v25i1e45767_fig6.png" xlink:type="simple" mimetype="image"/>
                  </td>
                </tr>
                <tr valign="top">
                  <td>Me</td>
                  <td>
                    <inline-graphic xlink:href="jmir_v25i1e45767_fig7.png" xlink:type="simple" mimetype="image"/>
                  </td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <table-wrap position="float" id="table2">
            <label>Table 2</label>
            <caption>
              <p>Self-report extraction. The performance results of the self-report filters on the Twitter data set—RegEx+MNLI classifier is the combination of RegEx and COVID-Twitter BERT (version 2) fine-tuned on the MultiNLI data set.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="370"/>
              <col width="120"/>
              <col width="120"/>
              <col width="90"/>
              <col width="300"/>
              <thead>
                <tr valign="top">
                  <td>Approach</td>
                  <td>Accuracy</td>
                  <td>Precision</td>
                  <td>Recall</td>
                  <td>
                    <italic>F</italic>
                    <sub>1</sub>
                  </td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>RegEx<sup>a</sup>+MNLI<sup>b</sup> classifier</td>
                  <td>0.83<sup>c</sup></td>
                  <td>0.79<sup>c</sup></td>
                  <td>0.71</td>
                  <td>0.74</td>
                </tr>
                <tr valign="top">
                  <td>RegEx</td>
                  <td>0.78</td>
                  <td>0.75</td>
                  <td>0.83<sup>c</sup></td>
                  <td>0.76<sup>c</sup></td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table2fn1">
                <p><sup>a</sup>RegEx: regular expression.</p>
              </fn>
              <fn id="table2fn2">
                <p><sup>b</sup>MNLI: natural language inference model.</p>
              </fn>
              <fn id="table2fn3">
                <p><sup>c</sup>The highest value in each column.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
        <sec>
          <title>Location Information Inference for Twitter Data</title>
          <p>For the Twitter data set, we used the open source Nominatim API [<xref ref-type="bibr" rid="ref33">33</xref>] to infer detailed location information such as city, region, and country from user-defined location information attached to tweets. The API uses OpenStreetMap (OpenStreetMap Foundation) data to make predictions. In cases where user-defined location information could allude to multiple locations (eg, London is a city in both Canada and England), Nominatim API returns the most likely location based on its criteria. For the Reddit data set, location information was not available.</p>
        </sec>
      </sec>
      <sec>
        <title>Extraction and Normalization of PCC Symptoms and Conditions</title>
        <p>Transformer-based BERT models were mainly used to extract and normalize PCC symptoms and conditions. Specifically, we used and compared 2 named entity recognition (NER) models, including UmlsBERT-Clinical [<xref ref-type="bibr" rid="ref32">32</xref>] and the Stanza clinical NER (Stanza-Clinical). UmlsBERT-Clinical model is an UmlsBERT NER fine-tuned on the n2c2 (2010) data set. Stanza-Clinical is a publicly available NER from the Stanford NLP group. Given the BERT models’ input length limit, we used rolling windows to segment Reddit posts into smaller chunks with less than 1024 characters.</p>
        <p>Following the symptom and condition term extraction task, we implemented a 2-step normalization task, as shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>, to analyze our findings, reduce inflectional forms, and compare them with existing works. The 2-step normalization task maps extracted terms to unique concepts in a standardized terminology. In the first step of normalization, we mapped each extracted raw term into its common base form. For instance, <italic>my tiredness</italic>, <italic>real tiredness</italic>, <italic>very tired,</italic> and <italic>chronic tiredness</italic> were normalized to <italic>tired.</italic> For this task, all extracted terms (eg, <italic>my</italic> <italic>tiredness</italic>) were tokenized, tagged, and clustered into nouns (eg, <italic>tired</italic>), pronouns (eg, <italic>my</italic>), or suffixes (eg, <italic>ness</italic>). Then, common base forms for terms were built through a manual review of all the nouns. In the second step of normalization, we transformed the terms in their common base form into their corresponding unique concept in a standardized terminology, for example, <italic>tired</italic> is mapped to <italic>fatigue</italic>. The standard terminologies were derived from a highly cited and used PCC research paper which gathered 203 symptoms from 3762 patients with PCC through a web-based survey [<xref ref-type="bibr" rid="ref3">3</xref>]. We, therefore, used the 203 symptoms as the standardized unique concept for the second step in the normalization procedure.</p>
        <p>The conversion between extracted raw terms from social media and either the common base form or the 203 unique concepts was done using a semantic search approach with BERT biencoders [<xref ref-type="bibr" rid="ref34">34</xref>]. Using this approach, we first created embeddings for all the extracted raw symptom and condition terms. Then, we retrieved the top common bases or unique concepts with high semantic overlap with the raw terms at the search time. Following a manual review of the retrieved pairs, we set a cutoff threshold where pairs with similarity scores greater than the threshold were stored as the match and included in our analysis. For the rest of this paper, <italic>mapped</italic> terms refer to the raw symptom and condition terms mapped to their common base in the first step of the normalization process. In addition, <italic>normalized</italic> terms refer to the mapped symptom and condition terms further transformed to the 203 standardized unique concepts derived from 3762 patients with PCC.</p>
      </sec>
      <sec>
        <title>Evaluation</title>
        <p>To determine how good the BERT models are at extracting the symptom and condition terms, we created a human-annotated corpus from Twitter. We established a proximity-based score to measure potential overlap. Our in-house human-annotated Twitter corpus includes 200 randomly sampled tweets annotated by 4 trained annotators. The proximity-based score was calculated by dividing the intersection of extracted entities by the union of extracted terms, with duplicated extracted entities removed for annotators and the model. The closer the proximity metric to 0, the closer the model’s predictions to the human-annotated benchmark. Validity and reliability of the extracted and normalized symptoms, a subset of symptom and condition terms, were evaluated compared with a web-based survey including 3762 participants from 56 countries [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>].</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>Several ethical considerations were examined during this study. The initial consideration pertained to whether the ethical standards were met when using social media data [<xref ref-type="bibr" rid="ref37">37</xref>]. It is important to note that ethics approval was not pursued for this study. In accordance with our institution's ethical perspective, the analysis of deidentified public data for trend and insight generation is deemed acceptable. This is on the condition that no individual-level data are disclosed and the risk of reidentification remains minimal. Deidentification measures were implemented at the beginning of the data set development process in accordance with our institution's viewpoint, and the updating of the data set made available for analysis has been restricted. As a measure to protect privacy, hashing was applied to all usernames and mentions during the deidentification process. Moreover, our research does not involve any direct interaction with social media users. Additionally, all the project participants have signed a data user agreement that restricts access and usage of the data for the sole purpose of use in this scientific research and not for any other purpose.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Data Overview</title>
        <p><xref ref-type="table" rid="table3">Table 3</xref> lists the statistics of the English-language posts collected from Twitter and Reddit. In addition to posts, we gathered the timestamp, geographical coordinates (if available), user’s location (user-defined), and user’s profile description for each tweet. A total of 107 countries were represented in our Twitter sample; most respondents tweeted from the United States (4850/10,938, 44.3%), followed by the United Kingdom (4316/10,938, 39.4%) and Canada (631/10,938, 5.8%).</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Statistics of Twitter and Reddit data.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="90"/>
            <col width="210"/>
            <col width="100"/>
            <col width="100"/>
            <col width="0"/>
            <col width="80"/>
            <col width="80"/>
            <col width="80"/>
            <col width="0"/>
            <col width="120"/>
            <col width="0"/>
            <col width="140"/>
            <thead>
              <tr valign="top">
                <td>Platform</td>
                <td>Period</td>
                <td colspan="3">Posts, n</td>
                <td colspan="4">Raw terms<sup>a</sup>, n</td>
                <td colspan="2">Total<sup>b</sup> mapped terms<sup>a</sup>, n</td>
                <td>Total<sup>b</sup> normalized terms<sup>a</sup>, n</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Before filter<sup>c</sup></td>
                <td>After filter<sup>c</sup></td>
                <td colspan="4">Posts<sup>d</sup></td>
                <td colspan="2">Unique<sup>e</sup></td>
                <td colspan="2">Total<sup>b</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Twitter</td>
                <td>August 2019 to June 2021</td>
                <td>466,651</td>
                <td>84,621</td>
                <td colspan="2">28,202</td>
                <td>22,451</td>
                <td>52,806</td>
                <td colspan="2">33,175</td>
                <td colspan="2">26,247</td>
              </tr>
              <tr valign="top">
                <td>Reddit</td>
                <td>July 2020 to September 2021</td>
                <td>191,526</td>
                <td>129,917</td>
                <td colspan="2">128,820</td>
                <td>92,816</td>
                <td>357,887</td>
                <td colspan="2">243,342</td>
                <td colspan="2">209,193</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>Extracted symptom and condition terms using UmlsBERT.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>Total denotes the total occurrence counts of extracted symptom and condition terms.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>The filter refers to the self-report filter.</p>
            </fn>
            <fn id="table3fn4">
              <p><sup>d</sup>Posts denotes the count of posts with at least 1 extracted symptom and condition term.</p>
            </fn>
            <fn id="table3fn5">
              <p><sup>e</sup>Unique denotes the counts of unique extracted symptom and condition terms.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Evaluation of Extraction of Symptom and Condition Terms</title>
        <p><xref ref-type="table" rid="table4">Table 4</xref> compares the performance of UmlsBERT-Clinical and Stanza-Clinical for the entity extraction task from Twitter data. Both models’ performances are compared against human annotators regarding the proximity score. As shown in the table, we also included entity extraction results by the data augmentation approach UMLS MetaMap (+American Medical Informatics Association; AMIA) introduced in our earlier work [<xref ref-type="bibr" rid="ref32">32</xref>]. UMLS MetaMap (+AMIA) uses the MetaMapLite tool to extract entities associating with UMLS’ concept unique identifiers and augments the results with a manually annotated data set consisting of clinical concepts and colloquial expressions (eg, <italic>brain fog</italic>) from tweets. Based on the results, UMLS MetaMap (+AMIA) tends to capture more entities than human annotators; however, some may not be as relevant or provide sufficient insight to experts. Stanza tends to capture fewer entities than human annotators. Consequently, there is the risk of missing information that subject matter experts may consider relevant. UmlsBERT-Clinical has the lowest sum of absolute values for the proximity-based evaluation metric (0.28), indicating predictions closest to those extracted by human annotators from the sample. Hereafter, the rest of the analysis is based on using UmlsBERT-Clinical for the extraction task due to its better overall performance than Stanza.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Proximity score. Comparison of the entities identified by trained human annotators in 200 tweets with the outputs of UmlsBERT-Clinical, Stanza-Clinical, and MetaMap+AMIA<sup>a</sup>. Annotators 3 and 4 have medical backgrounds. The comparison values are based on the proximity-based evaluation metric, defined as the difference between the annotator’s and model’s match counts. The closer the proximity metric to 0, the closer the model’s predictions to the ground truth (human annotator).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="300"/>
            <col width="200"/>
            <col width="190"/>
            <col width="310"/>
            <thead>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>UmlsBERT-Clinical [<xref ref-type="bibr" rid="ref32">32</xref>]</td>
                <td>MetaMap+AMIA [<xref ref-type="bibr" rid="ref32">32</xref>]</td>
                <td>Stanza-Clinical</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Annotator 1</td>
                <td>−0.13</td>
                <td>−0.46</td>
                <td>0.33</td>
              </tr>
              <tr valign="top">
                <td>Annotator 2</td>
                <td>−0.02</td>
                <td>−0.29</td>
                <td>0.33</td>
              </tr>
              <tr valign="top">
                <td>Annotator 3</td>
                <td>0</td>
                <td>−0.31</td>
                <td>0.51</td>
              </tr>
              <tr valign="top">
                <td>Annotator 4</td>
                <td>0.13</td>
                <td>−0.17</td>
                <td>0.48</td>
              </tr>
              <tr valign="top">
                <td>Sum of the absolute values</td>
                <td>0.28</td>
                <td>1.23</td>
                <td>1.65</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>AMIA: American Medical Informatics Association.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Occurrence Frequency Estimation at Any Point in Time</title>
        <sec>
          <title>Overview</title>
          <p>The occurrence frequency of normalized symptom and condition terms at any point in time is shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>. <xref rid="figure2" ref-type="fig">Figure 2</xref>A and <xref rid="figure2" ref-type="fig">Figure 2</xref>B depict the occurrence frequency of the normalized terms, and <xref rid="figure2" ref-type="fig">Figure 2</xref>C illustrates the occurrence frequency of mapped terms. The normalized terms were further categorized by the affected organ systems, similar to the survey study [<xref ref-type="bibr" rid="ref3">3</xref>], and the aggregated occurrence frequency per each category is shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>B.</p>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>The occurrence frequency of the most prevailing extracted symptom and condition terms in Twitter and Reddit data with occurrence frequency greater than 1% (n&gt;350 for Twitter, and n&gt;4000 for Reddit). Normalized terms are the raw terms that were normalized (after a 2-step normalization process, as shown in Figure 1) to the 203 standardized unique concepts derived from a web-based survey of 3762 patients with post–COVID-19 condition [<xref ref-type="bibr" rid="ref3">3</xref>]. For instance, “my tiredness” is normalized into “fatigue.” Grouped terms are the normalized terms that were further categorized based on the affected organ system established by Davis et al [<xref ref-type="bibr" rid="ref3">3</xref>]. Novel terms are the mapped terms that we had not normalized to the 203 standardized unique concepts because they were neither reported nor categorized in the survey study [<xref ref-type="bibr" rid="ref3">3</xref>]. HEENT: head, eyes, ears, nose, and throat.</p>
            </caption>
            <graphic xlink:href="jmir_v25i1e45767_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Comparison of Social Media to the Survey Study</title>
          <p>Systemic and neuropsychiatric symptoms were evidenced as the top-occurring symptoms in our study and the survey study. Fatigue (Twitter: 3434/33,175; Reddit: 3486/243,342; so n~3400, 10.3%) appeared as the most frequent symptom on both Twitter and Reddit. On Twitter (n=33,175), the top 5 most occurring symptoms also included anxiety (n=1433, 4.8%), shortness of breath (n=1136, 3.5%), brain fog (n=1072, 3.4%), and depression (n=803, 2.7%), whereas for Reddit (n=243,342), they also included anxiety (n=3984, 4.6%), tachycardia (n=3016, 4.3%), brain fog (n=3521, 3.6%), and shortness of breath (n=1889, 2.7%). This observation is in line with the survey study [<xref ref-type="bibr" rid="ref3">3</xref>], where fatigue, breathing issues, and cognitive dysfunction (eg, depression and anxiety) were reported by patients as the top 3 most debilitating symptoms. Other terms, including <italic>headaches</italic>, <italic>dizziness</italic>, <italic>pain</italic>, <italic>burning in the chest</italic>, <italic>fever</italic>, <italic>nausea</italic>, <italic>dry cough</italic>, and <italic>neuralgia</italic> appeared in the top 20 occurring terms in both Twitter and Reddit data albeit with slightly different prevalence (<xref rid="figure2" ref-type="fig">Figure 2</xref>A). <italic>Immunologic/autoimmune</italic>, <italic>dermatology</italic>, and <italic>reproductive/genitourinary/endocrine</italic> were the lowest 3 categories of terms across all 3 sources—Twitter, Reddit, and the survey study.</p>
        </sec>
        <sec>
          <title>Discovery of Novel Conditions</title>
          <p>Novel symptom and condition terms shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>C are neither reported nor categorized in the survey study [<xref ref-type="bibr" rid="ref3">3</xref>]. Among novel terms, <italic>infection</italic> and <italic>pain</italic> are the top 2 reported conditions. Other terms, including <italic>flu</italic>, <italic>organ damage</italic>, <italic>hair loss</italic>, <italic>weight loss</italic>, <italic>dementia</italic>, <italic>parasomnia</italic>, <italic>pneumonia</italic>, <italic>dysautonomia</italic>, <italic>kidney issues</italic>, and <italic>chronic obstructive pulmonary disease</italic> were among the top 1% (Twitter: n=500; Reddit: n=700) of reported terms.</p>
        </sec>
      </sec>
      <sec>
        <title>Co-Occurrence Frequency Estimation at Any Point</title>
        <p><xref rid="figure3" ref-type="fig">Figure 3</xref> shows how often pairs of PCC-normalized symptom and condition terms co-occur in both Twitter and Reddit. As expected, the co-occurrence map is more “dense” for the Reddit data (<xref rid="figure3" ref-type="fig">Figure 3</xref>B) than the Twitter data (<xref rid="figure3" ref-type="fig">Figure 3</xref>A). Since Reddit posts are significantly longer than tweets, they contain more contextual information and repeated symptom and condition terms. Based on the results, the pair of <italic>fatigue</italic> and <italic>headaches</italic> was among the most co-occurring terms across both platforms. In addition, for Twitter data, the pairs of <italic>fatigue</italic> and <italic>shortness of breath</italic>, <italic>fatigue</italic> and <italic>migraines</italic>, <italic>fatigue</italic> and <italic>general pain</italic>, <italic>fatigue</italic> and <italic>hair loss</italic>, <italic>fatigue</italic> and <italic>infection</italic>, <italic>brain fog</italic> and <italic>fatigue</italic>, and <italic>depression</italic> and <italic>anxiety</italic> co-occur more commonly than other terms; for Reddit, common symptom and condition pairs include <italic>fatigue</italic> and <italic>bradycardia</italic>, <italic>fatigue</italic> and <italic>anxiety</italic>, and <italic>fatigue</italic> and <italic>short term memory loss</italic>.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Co-occurrence frequency of normalized post–COVID-19 condition terms in Twitter data (A), showing co-occurrence frequencies higher than 50%, and in Reddit data (B), showing co-occurrence frequencies higher than 10%. Higher values are shown by the intensity of pink and blue shading. Normalized terms are the raw terms that were normalized (after a 2-step normalization process, as shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>) to the 203 standardized unique concepts derived from a web-based survey of 3762 patients with post–COVID-19 condition [<xref ref-type="bibr" rid="ref3">3</xref>]. For instance, “my tiredness” is normalized into “fatigue.” Please see <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> for a larger version.</p>
          </caption>
          <graphic xlink:href="jmir_v25i1e45767_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Spatio-Temporal Frequency Estimation</title>
        <p>The distribution of normalized symptom and condition terms (standardized per month) over time is shown in <xref rid="figure4" ref-type="fig">Figure 4</xref> for Twitter and Reddit data. The incidence of the neuropsychiatric symptom and condition terms is dominant, followed by the systemic category, across both social media platforms. On a more granular level, <italic>fatigue</italic>, <italic>anxiety</italic>, and <italic>infections</italic> were the most prevalent terms reported. Our findings indicate that the predominance of terms varied over time, where for Twitter, <italic>anxiety</italic> was dominant through June 2020, and afterward, <italic>fatigue</italic> was the most commonly reported symptom. <italic>Infection</italic> has been reported by users as a persistent condition for the entire period. On Reddit, for most periods, <italic>fatigue</italic> and <italic>infection</italic> were more dominant than <italic>anxiety</italic>.</p>
        <p>Based on our spatial analysis performed on Twitter data, among all the normalized symptom and condition terms (26,247 terms aggregated across all tweets, as shown in <xref ref-type="table" rid="table3">Table 3</xref>), 41% (n=10,878) included location information. The 41% (n=10,878) were spread across 62 countries, whereby the United States (n=4850, 15%), United Kingdom (n=4316, 13%), and Canada (n=631, 2%) were the top 3 for self-reporting of symptoms related to PCC (full details are listed in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The other 59 countries reported fewer than 1% (n&lt;251) of the symptom and condition terms and were excluded from our analysis. <xref rid="figure5" ref-type="fig">Figure 5</xref> indicates the proportional contribution of the top 4 reporting countries to the total occurrence frequency of symptom and condition terms normalized to unique concepts in KB and grouped by the affected organ system.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>The distribution rate of normalized and grouped post–COVID-19 condition terms over time; the rates are standardized per month.</p>
          </caption>
          <graphic xlink:href="jmir_v25i1e45767_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>The proportional contribution (n=10,878, 41%) of the top 4 countries (the United States, United Kingdom, Canada, and Australia) to each group's occurrence frequency of symptom and condition terms. The proportions are measured as a percentage of frequency group-related terms per each country group divided by the total count of terms in that group. HEENT: head, eyes, ears, nose, and throat.</p>
          </caption>
          <graphic xlink:href="jmir_v25i1e45767_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Overview</title>
        <p>The overarching goal of this study was to highlight the possibility of gaining insight into the patient’s experience of PCC using social media and NLP approaches. User-generated social media data provide a rich yet challenging source of information about the patient's journey outside the health care setting. Significant limitations remain concerning recognizing a patient’s lived experience instead of opinion and aligning common vernacular to recognized medical terminology. However, our study has made progress toward narrowing the data quality gap and evaluating the validity and reliability of social media-driven outcomes regarding PCC using state-of-the-art NLP approaches. Our results suggest that there is value in the learnings and methodologies outlined in this study to gather insights from patient-reported outcomes on social media platforms.</p>
      </sec>
      <sec>
        <title>Principal Findings</title>
        <p>Our transformer-based entity extraction tool, clinical UmlsBERT, outperformed Stanza and UMLS MetaMap (+AMIA) to extract symptom and condition terms from both Twitter and Reddit. Our results confirm the previous view that augmenting contextual embeddings with expert domains from a knowledge base outperforms domain-specific models on common named-entity recognition inside and outside health care settings [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]. The outcome of our NLP pipeline, mapped and normalized symptom and condition terms, was comparable with the outcomes of peer-reviewed papers relevant to PCC symptoms [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. Our analysis confirms prior findings that PCC is a multisystemic condition affecting multiple organ systems. Our study showed that <italic>fatigue</italic>, <italic>brain fog</italic>, <italic>anxiety</italic>, and <italic>shortness of breath</italic> are the most commonly occurring groups of terms for PCC symptoms on Twitter and Reddit. This aligns with the primary discoveries of recent studies [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref39">39</xref>], where the top 3 most debilitating symptoms listed by patients were fatigue, breathing issues, and cognitive impairment.</p>
        <p>Our findings support the increasing interest in and confirm the need for supplementing clinical observation with user-generated data [<xref ref-type="bibr" rid="ref40">40</xref>]. In this study, we reported the frequency of co-occurring symptom and condition terms. Based on our results, the pair of <italic>fatigue</italic> and <italic>headaches</italic> was among the most co-occurring terms on both social media platforms; to the best of our knowledge, expressing a combination of symptoms related to PCC has been rarely reported in the literature, certainly not from social media based studies. One instance of clinically derived analysis of symptom co-occurrence comes from a US-based retrospective cohort study [<xref ref-type="bibr" rid="ref36">36</xref>] evaluating long-term symptoms in COVID-19 survivors, where they similarly observe that fatigue tends to co-occur frequently with abnormal breathing or shortness of breath in patients with PCC. The potential value of this analysis is in connecting symptom and condition terms early in illness onset with prognostic factors, such as the likelihood of developing PCC, and the severity or duration of the PCC condition.</p>
        <p>Our study and the survey study by Davis et al [<xref ref-type="bibr" rid="ref3">3</xref>] share another point of agreement: uncovering symptoms that are not commonly mentioned in public discussion of PCC, which we call novel symptoms and conditions. Our study revealed that users experienced unique symptoms like infection, hair loss, and weight loss, as well as reported conditions that resembled those of other illnesses such as flu, cancer, or Lyme disease.</p>
        <p>One strength of the NLP pipeline developed in this study is its scalability, leading to high-level adaptive capacity for other social media platforms or different medical conditions. Our NLP pipeline is well-poised to scale the information extraction process from user-generated data and connect vernacular to recognized medical ontology. In addition, it enables exploring the longitudinal evolution of symptoms, which may be correlated with prevalent SARS-CoV-2 variants at the time of onset to guide insights into variation in disease course associated with different source viral strains.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>A common issue with automating entity extraction and normalization is losing the context behind the extracted terms (in the form of independent tokens or words). Without context, it may be difficult to interpret the meaning behind these tokens. For example, the token <italic>death</italic> could be extracted from a personal opinion, an expression of a feeling, or a factual statement; an example of a personal opinion is “I think COVID-19 increases the likelihood of death,” whereas an example of a factual statement is “My cousin died of COVID-19.” In addition, an example of symptom expression tied to the condition is “This long COVID feels like death.” Furthermore, clustering semantically similar extracted symptoms and bringing them closer to predefined standards and common medical terms increases the risk of losing context.</p>
        <p>Another limitation of this study relates to our self-report filter. While RegExs are effective in finding posts containing first-person self-reports, they are also prone to false positives—for example, mistakenly keeping a post that voices an opinion or reports symptoms on behalf of someone else—and false negatives—for example, mistakenly discarding a tweet that excludes first-person pronouns but would still qualify as a self-report. We, therefore, considered other context-based approaches, including a fine-tuned BERT classifier. These approaches may reduce false positive and false negative rates; however, further work is needed to annotate sufficient data sets for manually fine-tuning, sweeping, and optimizing hyperparameters.</p>
        <p>The nature of social media data and our NLP pipeline introduces bias to our findings, which could impact the reliability of our outcomes. Social media content is unlikely to represent the broader population due to demographic biases in technology uptake, barriers to access, and regional social media platform preferences. Furthermore, only a subset of users are patients who publicly share their experiences with PCC. Users’ self-reports may also be influenced by prominent opinions reinforced on the internet through news media or other social media “influencers,” which adds to sample bias. In addition to inherent bias, our analysis is further biased by including only English-language posts.</p>
        <p>In social media, data quality is highly variable due to the use of colloquial language (eg, “I am dying”), brevity or shorthand, and grammatical and spelling errors. Tweets were first collected based on the presence of relevant hashtags and keywords (eg, <italic>long COVID</italic>). While this approach successfully surfaced many relevant tweets, both false positives (eg, mistakenly keeping a tweet that refers to <italic>long haul</italic> in the context of transportation) and false negatives (eg, mistakenly discarding a relevant tweet because it lacks or has a typo in a target hashtag or keyword) can occur. False positives could be reduced by checking for the context surrounding a match, for example, excluding tweets that refer to long-haul flights. False negatives could be reduced using advanced semantic language models beyond keyword matching, for example, a classifier that distinguishes clinical tweets from nonclinical ones.</p>
        <p>Additionally, concerning the data quality, this study lacks the ability to verify the genuineness and authenticity of users’ posts on social media. Ensuring the authenticity of users is a crucial aspect of using social media to provide insights into public health. Some potential strategies to improve the quality of social media content and exclude fake accounts and bots include implementing account verification processes, monitoring user behavior for suspicious activity, and using available ML models built to identify fraudulent activities.</p>
      </sec>
      <sec>
        <title>Best Practices and Future Directions</title>
        <sec>
          <title>Overview</title>
          <p>In this section, we would like to explain the best practices for researchers and developers interested in applying NLP to social media to facilitate information extraction tasks.</p>
        </sec>
        <sec>
          <title>Generalizable Models</title>
          <p>UmlsBERT is a transformer-based contextual model with better generalizability and reliability than traditional entity extraction models. However, in our experiments we found that these models may overfit to occurrences of specific tokens such as <italic>vid</italic>. As a result, at inference time, incorrect tokens may be captured. A simple solution to extend the capabilities of these models is by looking at the frequencies of captured tokens and devising simple rules to correct the errors. A simple rule-based strategy should also remove variations of the same frequently occurring tokens. Another potential solution for this task would be using symptom ontology and standard normalization procedures to facilitate comparisons between variations of the same token. Literature references and ground truths can also be referred to for this procedure, especially in the case of new illnesses wherein ontologies may not fully capture the experiences that patients are trying to express—for example, brain fog.</p>
        </sec>
        <sec>
          <title>Longitudinal Analysis</title>
          <p>Providing a longitudinal analysis of posts from the same user will enable better characterization of the evolution of symptoms over time. However, it is essential to note that this might pose challenges for ensuring privacy as, for example, the combination of posts may increase the possibility of reidentifying a user. In addition, aggregating a user’s posts may allow illness to be inferred in patients who have not consented to such an assessment.</p>
        </sec>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In this study, we successfully used transformer-based BERT models to extract and normalize PCC symptom and condition terms from social media platforms. We evaluated the effectiveness, validity, and reliability of NLP models through comparison with a human-annotated corpus and a web-based survey with more than 3000 participants from several countries. In summary, the outcome of the NLP models aligned with and complemented the previous research regarding the occurrence and co-occurrence frequency of PCC-related symptom and condition terms. In conclusion, our findings support that social media can augment health care research by providing insights into diseases that are captured outside usual episodes of clinical care. Moreover, it promotes pandemic advance monitoring and response by enhancing the scope of information-feeding risk models. Significant challenges remain for improving the accuracy and context with which symptoms are recognized (from vernacular) and interpreted (to medical ontology), which, if resolved, would add to the overall use of the process.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Table S1: Twitter hashtags list and Table S2: Number of tweets per country list.</p>
        <media xlink:href="jmir_v25i1e45767_app1.docx" xlink:title="DOCX File , 26 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Co-occurrence frequency of normalized post–COVID-19 condition terms in (A) Twitter data, where it is higher than 50%, and (B) Reddit data, where it is higher than 10%. Higher values are shown by the intensity of pink and blue shading. Normalized terms are the raw terms that were normalized (after a 2-step normalization process, as shown in Figure 1) to the 203 standardized unique concepts derived from a web-based survey of 3762 patients with post–COVID-19 condition [<xref ref-type="bibr" rid="ref3">3</xref>]. For instance, “my tiredness” is normalized into “fatigue.”</p>
        <media xlink:href="jmir_v25i1e45767_app2.png" xlink:title="PNG File , 3180 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AMIA</term>
          <def>
            <p>American Medical Informatics Association</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">API</term>
          <def>
            <p>application programming interface</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">BERT</term>
          <def>
            <p>bidirectional encoder representations from transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">NER</term>
          <def>
            <p>named entity recognition</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">RegEx</term>
          <def>
            <p>regular expression</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The participating companies and medical subject matter experts deserve special credit. Together, over the course of the project, Deloitte, Roche, and TELUS provided clinical and machine learning expertise that enabled the pipeline to function. Key contributions by these Vector sponsor companies include original project ideation; review of clinical literature on PCC; data collection, cleaning, and annotation; filter implementation; normalization; named entity recognition modeling; visualization engineering; and interpretation of results. We also thank Jennifer Camaradou, a patient advocate, for her insights into PCC and the potential benefits of this body of work.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The data sets generated and analyzed during this study are available in our GitHub repository [<xref ref-type="bibr" rid="ref41">41</xref>].</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Deer</surname>
              <given-names>RR</given-names>
            </name>
            <name name-style="western">
              <surname>Rock</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Vasilevsky</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Carmody</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Rando</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Anzalone</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Basson</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Bennett</surname>
              <given-names>TD</given-names>
            </name>
            <name name-style="western">
              <surname>Bergquist</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Boudreau</surname>
              <given-names>EA</given-names>
            </name>
            <name name-style="western">
              <surname>Bramante</surname>
              <given-names>CT</given-names>
            </name>
            <name name-style="western">
              <surname>Byrd</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Callahan</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Chan</surname>
              <given-names>LE</given-names>
            </name>
            <name name-style="western">
              <surname>Chu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chute</surname>
              <given-names>CG</given-names>
            </name>
            <name name-style="western">
              <surname>Coleman</surname>
              <given-names>BD</given-names>
            </name>
            <name name-style="western">
              <surname>Davis</surname>
              <given-names>HE</given-names>
            </name>
            <name name-style="western">
              <surname>Gagnier</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Greene</surname>
              <given-names>CS</given-names>
            </name>
            <name name-style="western">
              <surname>Hillegass</surname>
              <given-names>WB</given-names>
            </name>
            <name name-style="western">
              <surname>Kavuluru</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kimble</surname>
              <given-names>WD</given-names>
            </name>
            <name name-style="western">
              <surname>Koraishy</surname>
              <given-names>FM</given-names>
            </name>
            <name name-style="western">
              <surname>Köhler</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Madhira</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Madlock-Brown</surname>
              <given-names>CR</given-names>
            </name>
            <name name-style="western">
              <surname>Matentzoglu</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Mazzotti</surname>
              <given-names>DR</given-names>
            </name>
            <name name-style="western">
              <surname>McMurry</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>McNair</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Moffitt</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Monteith</surname>
              <given-names>TS</given-names>
            </name>
            <name name-style="western">
              <surname>Parker</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Perry</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Pfaff</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Reese</surname>
              <given-names>JT</given-names>
            </name>
            <name name-style="western">
              <surname>Saltz</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Schuff</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Solomonides</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Solway</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Spratt</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Stein</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Sule</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>Topaloglu</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Vavougios</surname>
              <given-names>GD</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Haendel</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Robinson</surname>
              <given-names>PN</given-names>
            </name>
          </person-group>
          <article-title>Characterizing long COVID: deep phenotype of a complex condition</article-title>
          <source>eBioMedicine</source>
          <year>2021</year>
          <volume>74</volume>
          <fpage>103722</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.thelancet.com/journals/ebiom/article/PIIS2352-3964(21)00516-8/fulltext"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.ebiom.2021.103722</pub-id>
          <pub-id pub-id-type="medline">34839263</pub-id>
          <pub-id pub-id-type="pii">S2352-3964(21)00516-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC8613500</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Domingo</surname>
              <given-names>FR</given-names>
            </name>
            <name name-style="western">
              <surname>Waddell</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Cheung</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Cooper</surname>
              <given-names>CL</given-names>
            </name>
            <name name-style="western">
              <surname>Belcourt</surname>
              <given-names>VJ</given-names>
            </name>
            <name name-style="western">
              <surname>Zuckermann</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Corrin</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Ahmad</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Boland</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Laprise</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Idzerda</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Prevalence of long-term effects in individuals diagnosed with COVID-19: an updated living systematic review</article-title>
          <source>bioRxiv, medRxiv</source>
          <year>2021</year>
          <fpage>1</fpage>
          <lpage>59</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.medrxiv.org/content/10.1101/2021.06.03.21258317v2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1101/2021.06.03.21258317</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Davis</surname>
              <given-names>HE</given-names>
            </name>
            <name name-style="western">
              <surname>Assaf</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>McCorkell</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Low</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Re'em</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Redfield</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Austin</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Akrami</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Characterizing long COVID in an international cohort: 7 months of symptoms and their impact</article-title>
          <source>eClinicalMedicine</source>
          <year>2021</year>
          <volume>38</volume>
          <fpage>101019</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.thelancet.com/journals/eclinm/article/PIIS2589-5370(21)00299-6/fulltext"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.eclinm.2021.101019</pub-id>
          <pub-id pub-id-type="medline">34308300</pub-id>
          <pub-id pub-id-type="pii">S2589-5370(21)00299-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC8280690</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mahase</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Covid-19: what do we know about "long covid"?</article-title>
          <source>BMJ</source>
          <year>2020</year>
          <volume>370</volume>
          <fpage>m2815</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.bmj.com/content/370/bmj.m2815"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmj.m2815</pub-id>
          <pub-id pub-id-type="medline">32665317</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chakraborty</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>JN</given-names>
            </name>
            <name name-style="western">
              <surname>Spagnoli</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Amin</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Mccoy</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Swaminathan</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Yohannan</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Philip</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Long-term cardiovascular outcomes of multisystem inflammatory syndrome in children associated with COVID-19 using an institution based algorithm</article-title>
          <source>Pediatr Cardiol</source>
          <year>2023</year>
          <volume>44</volume>
          <issue>2</issue>
          <fpage>367</fpage>
          <lpage>380</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/article/10.1007/s00246-022-03020-w"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s00246-022-03020-w</pub-id>
          <pub-id pub-id-type="medline">36214896</pub-id>
          <pub-id pub-id-type="pii">10.1007/s00246-022-03020-w</pub-id>
          <pub-id pub-id-type="pmcid">PMC9549828</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Daugherty</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Heath</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Dasmariñas</surname>
              <given-names>MC</given-names>
            </name>
            <name name-style="western">
              <surname>Jubilo</surname>
              <given-names>KG</given-names>
            </name>
            <name name-style="western">
              <surname>Samranvedhya</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lipsitch</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Risk of clinical sequelae after the acute phase of SARS-CoV-2 infection: retrospective cohort study</article-title>
          <source>BMJ</source>
          <year>2021</year>
          <volume>373</volume>
          <fpage>n1098</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.bmj.com/content/373/bmj.n1098"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmj.n1098</pub-id>
          <pub-id pub-id-type="medline">34011492</pub-id>
          <pub-id pub-id-type="pmcid">PMC8132065</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ayoubkhani</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Khunti</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Nafilyan</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Maddox</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Humberstone</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Diamond</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Banerjee</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Post-COVID syndrome in individuals admitted to hospital with COVID-19: retrospective cohort study</article-title>
          <source>BMJ</source>
          <year>2021</year>
          <volume>372</volume>
          <fpage>n693</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.bmj.com/content/372/bmj.n693"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmj.n693</pub-id>
          <pub-id pub-id-type="medline">33789877</pub-id>
          <pub-id pub-id-type="pmcid">PMC8010267</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brownstein</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Freifeld</surname>
              <given-names>CC</given-names>
            </name>
            <name name-style="western">
              <surname>Madoff</surname>
              <given-names>LC</given-names>
            </name>
          </person-group>
          <article-title>Digital disease detection—harnessing the web for public health surveillance</article-title>
          <source>N Engl J Med</source>
          <year>2009</year>
          <volume>360</volume>
          <issue>21</issue>
          <fpage>2153</fpage>
          <lpage>2155</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/19423867"/>
          </comment>
          <pub-id pub-id-type="doi">10.1056/NEJMp0900702</pub-id>
          <pub-id pub-id-type="medline">19423867</pub-id>
          <pub-id pub-id-type="pii">NEJMp0900702</pub-id>
          <pub-id pub-id-type="pmcid">PMC2917042</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eysenbach</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Infodemiology and infoveillance: framework for an emerging set of public health informatics methods to analyze search, communication and publication behavior on the internet</article-title>
          <source>J Med Internet Res</source>
          <year>2009</year>
          <volume>11</volume>
          <issue>1</issue>
          <fpage>e11</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2009/1/e11"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/jmir.1157</pub-id>
          <pub-id pub-id-type="medline">19329408</pub-id>
          <pub-id pub-id-type="pii">v11i1e11</pub-id>
          <pub-id pub-id-type="pmcid">PMC2762766</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Salathé</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Digital epidemiology: what is it, and where is it going?</article-title>
          <source>Life Sci Soc Policy</source>
          <year>2018</year>
          <volume>14</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://lsspjournal.biomedcentral.com/articles/10.1186/s40504-017-0065-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s40504-017-0065-7</pub-id>
          <pub-id pub-id-type="medline">29302758</pub-id>
          <pub-id pub-id-type="pii">10.1186/s40504-017-0065-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC5754279</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thackeray</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Neiger</surname>
              <given-names>BL</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Van Wagenen</surname>
              <given-names>SB</given-names>
            </name>
          </person-group>
          <article-title>Adoption and use of social media among public health departments</article-title>
          <source>BMC Public Health</source>
          <year>2012</year>
          <volume>12</volume>
          <fpage>242</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcpublichealth.biomedcentral.com/articles/10.1186/1471-2458-12-242"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1471-2458-12-242</pub-id>
          <pub-id pub-id-type="medline">22449137</pub-id>
          <pub-id pub-id-type="pii">1471-2458-12-242</pub-id>
          <pub-id pub-id-type="pmcid">PMC3331826</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dixon</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <source>Number of social media users worldwide from 2018 to 2022, with forecasts from 2023 to 2027</source>
          <access-date>2023-07-13</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.statista.com/statistics/278414/number-of-worldwide-social-network-users/">https://www.statista.com/statistics/278414/number-of-worldwide-social-network-users/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>XR</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Pender</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>CP</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>YG</given-names>
            </name>
          </person-group>
          <article-title>Experiences and challenges in the health protection of medical teams in the Chinese Ebola treatment center, Liberia: a qualitative study</article-title>
          <source>Infect Dis Poverty</source>
          <year>2018</year>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>92</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://idpjournal.biomedcentral.com/articles/10.1186/s40249-018-0468-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s40249-018-0468-6</pub-id>
          <pub-id pub-id-type="medline">30134982</pub-id>
          <pub-id pub-id-type="pii">10.1186/s40249-018-0468-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC6103862</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Masri</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jia</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>MC</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Use of Twitter data to improve Zika virus surveillance in the United States during the 2016 epidemic</article-title>
          <source>BMC Public Health</source>
          <year>2019</year>
          <volume>19</volume>
          <issue>1</issue>
          <fpage>761</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcpublichealth.biomedcentral.com/articles/10.1186/s12889-019-7103-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12889-019-7103-8</pub-id>
          <pub-id pub-id-type="medline">31200692</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12889-019-7103-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC6570872</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tao</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Rundensteiner</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Crowdsourcing and machine learning approaches for extracting entities indicating potential foodborne outbreaks from social media</article-title>
          <source>Sci Rep</source>
          <year>2021</year>
          <volume>11</volume>
          <issue>1</issue>
          <fpage>21678</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.nature.com/articles/s41598-021-00766-w"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-021-00766-w</pub-id>
          <pub-id pub-id-type="medline">34737325</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-021-00766-w</pub-id>
          <pub-id pub-id-type="pmcid">PMC8568976</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Paul</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Dredze</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>You are what you tweet: analyzing Twitter for public health</article-title>
          <source>ICWSM</source>
          <year>2011</year>
          <volume>5</volume>
          <issue>1</issue>
          <fpage>265</fpage>
          <lpage>272</lpage>
          <pub-id pub-id-type="doi">10.1609/icwsm.v5i1.14137</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tao</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Utilization of text mining as a big data analysis tool for food science and nutrition</article-title>
          <source>Compr Rev Food Sci Food Saf</source>
          <year>2020</year>
          <volume>19</volume>
          <issue>2</issue>
          <fpage>875</fpage>
          <lpage>894</lpage>
          <pub-id pub-id-type="doi">10.1111/1541-4337.12540</pub-id>
          <pub-id pub-id-type="medline">33325182</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Park</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Conway</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Tracking health related discussions on Reddit for public health applications</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2017</year>
          <volume>2017</volume>
          <fpage>1362</fpage>
          <lpage>1371</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/29854205"/>
          </comment>
          <pub-id pub-id-type="medline">29854205</pub-id>
          <pub-id pub-id-type="pmcid">PMC5977623</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mackey</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Purushothaman</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Nali</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bardier</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cuomo</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Machine learning to detect self-reporting of symptoms, testing access, and recovery associated with COVID-19 on Twitter: retrospective big data infoveillance study</article-title>
          <source>JMIR Public Health Surveill</source>
          <year>2020</year>
          <volume>6</volume>
          <issue>2</issue>
          <fpage>e19509</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://publichealth.jmir.org/2020/2/e19509/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/19509</pub-id>
          <pub-id pub-id-type="medline">32490846</pub-id>
          <pub-id pub-id-type="pii">v6i2e19509</pub-id>
          <pub-id pub-id-type="pmcid">PMC7282475</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Radloff</surname>
              <given-names>CL</given-names>
            </name>
            <name name-style="western">
              <surname>Wawrzynski</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Cloyes</surname>
              <given-names>KG</given-names>
            </name>
          </person-group>
          <article-title>Mining Twitter to explore the emergence of COVID-19 symptoms</article-title>
          <source>Public Health Nurs</source>
          <year>2020</year>
          <volume>37</volume>
          <issue>6</issue>
          <fpage>934</fpage>
          <lpage>940</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://onlinelibrary.wiley.com/doi/10.1111/phn.12809"/>
          </comment>
          <pub-id pub-id-type="doi">10.1111/phn.12809</pub-id>
          <pub-id pub-id-type="medline">32937679</pub-id>
          <pub-id pub-id-type="pmcid">PMC8080690</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jelodar</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Orji</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Deep sentiment classification and topic discovery on Novel Coronavirus or COVID-19 online discussions: NLP using LSTM recurrent neural network approach</article-title>
          <source>IEEE J Biomed Health Inform</source>
          <year>2020</year>
          <volume>24</volume>
          <issue>10</issue>
          <fpage>2733</fpage>
          <lpage>2742</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ieeexplore.ieee.org/document/9112671"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/JBHI.2020.3001216</pub-id>
          <pub-id pub-id-type="medline">32750931</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>KF</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Shia</surname>
              <given-names>BC</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>SY</given-names>
            </name>
          </person-group>
          <article-title>Prediction of number of cases of 2019 novel Coronavirus (COVID-19) using social media search index</article-title>
          <source>Int J Environ Res Public Health</source>
          <year>2020</year>
          <volume>17</volume>
          <issue>7</issue>
          <fpage>2365</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/1660-4601/17/7/2365"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/ijerph17072365</pub-id>
          <pub-id pub-id-type="medline">32244425</pub-id>
          <pub-id pub-id-type="pii">ijerph17072365</pub-id>
          <pub-id pub-id-type="pmcid">PMC7177617</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Low</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Rumker</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Talkar</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Torous</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cecchi</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Ghosh</surname>
              <given-names>SS</given-names>
            </name>
          </person-group>
          <article-title>Natural language processing reveals vulnerable mental health support groups and heightened health anxiety on Reddit during COVID-19: observational study</article-title>
          <source>J Med Internet Res</source>
          <year>2020</year>
          <volume>22</volume>
          <issue>10</issue>
          <fpage>e22635</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2020/10/e22635/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/22635</pub-id>
          <pub-id pub-id-type="medline">32936777</pub-id>
          <pub-id pub-id-type="pii">v22i10e22635</pub-id>
          <pub-id pub-id-type="pmcid">PMC7575341</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Müller</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Salathé</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kummervold</surname>
              <given-names>PE</given-names>
            </name>
          </person-group>
          <article-title>COVID-Twitter-BERT: a natural language processing model to analyse COVID-19 content on Twitter</article-title>
          <source>ArXiv Preprint posted online on 15 May 2020</source>
          <pub-id pub-id-type="doi">10.3389/frai.2023.1023281</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Michalopoulos</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kaka</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>UmlsBERT: clinical domain knowledge augmentation of contextual embeddings using the Unified Medical Language System Metathesaurus</article-title>
          <source>ArXiv Preprint posted online on 03 Jun 2021</source>
          <pub-id pub-id-type="doi">10.18653/v1/2021.naacl-main.139</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rahman</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Khatun</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Uzzaman</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sami</surname>
              <given-names>SI</given-names>
            </name>
            <name name-style="western">
              <surname>Bhuiyan</surname>
              <given-names>MAA</given-names>
            </name>
            <name name-style="western">
              <surname>Kiong</surname>
              <given-names>TS</given-names>
            </name>
          </person-group>
          <article-title>A comprehensive study of artificial intelligence and machine learning approaches in confronting the coronavirus (COVID-19) pandemic</article-title>
          <source>Int J Health Serv</source>
          <year>2021</year>
          <volume>51</volume>
          <issue>4</issue>
          <fpage>446</fpage>
          <lpage>461</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/10.1177/00207314211017469"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/00207314211017469</pub-id>
          <pub-id pub-id-type="medline">33999732</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Klein</surname>
              <given-names>AZ</given-names>
            </name>
            <name name-style="western">
              <surname>Magge</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>O'Connor</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Amaro</surname>
              <given-names>JIF</given-names>
            </name>
            <name name-style="western">
              <surname>Weissenbacher</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Hernandez</surname>
              <given-names>GG</given-names>
            </name>
          </person-group>
          <article-title>Toward using Twitter for tracking COVID-19: a natural language processing pipeline and exploratory data set</article-title>
          <source>J Med Internet Res</source>
          <year>2021</year>
          <volume>23</volume>
          <issue>1</issue>
          <fpage>e25314</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2021/1/e25314/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/25314</pub-id>
          <pub-id pub-id-type="medline">33449904</pub-id>
          <pub-id pub-id-type="pii">v23i1e25314</pub-id>
          <pub-id pub-id-type="pmcid">PMC7834613</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Serrano</surname>
              <given-names>JCM</given-names>
            </name>
            <name name-style="western">
              <surname>Papakyriakopoulos</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Hegelich</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>NLP-based feature extraction for the detection of COVID-19 misinformation videos on YouTube</article-title>
          <source>Proceedings of the 1st Workshop on NLP for COVID-19 at ACL 2020</source>
          <year>2020</year>
          <conf-name>1st Workshop on NLP for COVID-19 at ACL 2020</conf-name>
          <conf-date>July 2020</conf-date>
          <conf-loc>Virtual</conf-loc>
          <publisher-name>Association for Computational Linguistics</publisher-name>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2020.nlpcovid19-acl.17/"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Malden</surname>
              <given-names>DE</given-names>
            </name>
            <name name-style="western">
              <surname>Tartof</surname>
              <given-names>SY</given-names>
            </name>
            <name name-style="western">
              <surname>Ackerson</surname>
              <given-names>BK</given-names>
            </name>
            <name name-style="western">
              <surname>Hong</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Skarbinski</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yau</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Qian</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Fischer</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Shaw</surname>
              <given-names>SF</given-names>
            </name>
            <name name-style="western">
              <surname>Caparosa</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Natural language processing for improved characterization of COVID-19 symptoms: observational study of 350,000 patients in a large integrated health care system</article-title>
          <source>JMIR Public Health Surveill</source>
          <year>2022</year>
          <volume>8</volume>
          <issue>12</issue>
          <fpage>e41529</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://publichealth.jmir.org/2022/12/e41529"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/41529</pub-id>
          <pub-id pub-id-type="medline">36446133</pub-id>
          <pub-id pub-id-type="pii">v8i12e41529</pub-id>
          <pub-id pub-id-type="pmcid">PMC9822566</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>LL</given-names>
            </name>
            <name name-style="western">
              <surname>Lo</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Text mining approaches for dealing with the rapidly expanding literature on COVID-19</article-title>
          <source>Brief Bioinform</source>
          <year>2021</year>
          <volume>22</volume>
          <issue>2</issue>
          <fpage>781</fpage>
          <lpage>799</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/bib/article/22/2/781/6024738?login=false"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bib/bbaa296</pub-id>
          <pub-id pub-id-type="medline">33279995</pub-id>
          <pub-id pub-id-type="pii">6024738</pub-id>
          <pub-id pub-id-type="pmcid">PMC7799291</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kulev</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Köprü</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Rodriguez-Esteban</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Saldana</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>La Torraca</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ozkirimli</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Extraction of medication names from Twitter using augmentation and an ensemble of language models</article-title>
          <source>ArXiv Preprint posted online on 12 Nov 2021</source>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bhambhoria</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Saab</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Uppal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Yakimovich</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bhatti</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Valdamudi</surname>
              <given-names>NK</given-names>
            </name>
            <name name-style="western">
              <surname>Bales</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Dolatabadi</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Kocak</surname>
              <given-names>SA</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Shaban-Nejad</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Michalowski</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bianco</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Towards providing clinical insights on long covid from Twitter data</article-title>
          <source>Multimodal AI in Healthcare: A Paradigm Shift in Health Intelligence</source>
          <year>2023</year>
          <publisher-loc>Cham, Switzerland</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>267</fpage>
          <lpage>278</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Clemens</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Geocoding with OpenStreetMap data</article-title>
          <source>GEOProcessing</source>
          <year>2015</year>
          <access-date>2023-07-13</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.researchgate.net/profile/Bruno-M-Meneses/publication/280575974_Water_Quality_Impact_Assessment_of_Land_Use_and_Land_Cover_Changes_A_dynamic_IT_model_for_territorial_integrated_management/links/55bb739208aed621de0d9692/Water-Quality-Impact-Assessment-of-Land-Use-and-Land-Cover-Changes-A-dynamic-IT-model-for-territorial-integrated-management.pdf#page=11">https://www.researchgate.net/profile/Bruno-M-Meneses/publication/280575974_Water_Quality_Impact_Assessment_of_Land_Use_and_Land_Cover_Changes_A_dynamic_IT_model_for_territorial_integrated_management/links/55bb739208aed621de0d9692/Water-Quality-Impact-Assessment-of-Land-Use-and-Land-Cover-Changes-A-dynamic-IT-model-for-territorial-integrated-management.pdf#page=11</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reimers</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Gurevych</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Sentence-BERT: sentence embeddings using siamese BERT-networks</article-title>
          <source>ArXiv Preprint posted online on 27 Aug 2019</source>
          <pub-id pub-id-type="doi">10.18653/v1/d19-1410</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="web">
          <source>Long Covid: A Systematic Review and Meta-Analysis of 120,970 Patients</source>
          <access-date>2023-07-13</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4099429">https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4099429</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Taquet</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Dercon</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Luciano</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Geddes</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Husain</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Harrison</surname>
              <given-names>PJ</given-names>
            </name>
          </person-group>
          <article-title>Incidence, co-occurrence, and evolution of long-COVID features: a 6-month retrospective cohort study of 273,618 survivors of COVID-19</article-title>
          <source>PLoS Med</source>
          <year>2021</year>
          <volume>18</volume>
          <issue>9</issue>
          <fpage>e1003773</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.plos.org/plosmedicine/article?id=10.1371/journal.pmed.1003773"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pmed.1003773</pub-id>
          <pub-id pub-id-type="medline">34582441</pub-id>
          <pub-id pub-id-type="pii">PMEDICINE-D-21-02226</pub-id>
          <pub-id pub-id-type="pmcid">PMC8478214</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Burnap</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Sloan</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Towards an ethical framework for publishing Twitter data in social research: taking into account users' views, online context and algorithmic estimation</article-title>
          <source>Sociology</source>
          <year>2017</year>
          <volume>51</volume>
          <issue>6</issue>
          <fpage>1149</fpage>
          <lpage>1168</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/10.1177/0038038517708140"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/0038038517708140</pub-id>
          <pub-id pub-id-type="medline">29276313</pub-id>
          <pub-id pub-id-type="pii">10.1177_0038038517708140</pub-id>
          <pub-id pub-id-type="pmcid">PMC5718335</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yuan</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Improving biomedical pretrained language models with knowledge</article-title>
          <source>ArXiv Preprint posted online on 21 Apr 2021</source>
          <pub-id pub-id-type="doi">10.18653/v1/2021.bionlp-1.20</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Davis</surname>
              <given-names>HE</given-names>
            </name>
            <name name-style="western">
              <surname>McCorkell</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Vogel</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Topol</surname>
              <given-names>EJ</given-names>
            </name>
          </person-group>
          <article-title>Long COVID: major findings, mechanisms and recommendations</article-title>
          <source>Nat Rev Microbiol</source>
          <year>2023</year>
          <volume>21</volume>
          <issue>3</issue>
          <fpage>133</fpage>
          <lpage>146</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.nature.com/articles/s41579-022-00846-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41579-022-00846-2</pub-id>
          <pub-id pub-id-type="medline">36639608</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41579-022-00846-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC9839201</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chunara</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Smolinski</surname>
              <given-names>MS</given-names>
            </name>
            <name name-style="western">
              <surname>Brownstein</surname>
              <given-names>JS</given-names>
            </name>
          </person-group>
          <article-title>Why we need crowdsourced data in infectious disease surveillance</article-title>
          <source>Curr Infect Dis Rep</source>
          <year>2013</year>
          <volume>15</volume>
          <issue>4</issue>
          <fpage>316</fpage>
          <lpage>319</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/23689991"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s11908-013-0341-5</pub-id>
          <pub-id pub-id-type="medline">23689991</pub-id>
          <pub-id pub-id-type="pmcid">PMC3718458</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="web">
          <source>VectorInstitute/ProjectLongCovid-NER</source>
          <access-date>2023-08-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/VectorInstitute/ProjectLongCovid-NER">https://github.com/VectorInstitute/ProjectLongCovid-NER</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
