<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v24i1e28749</article-id>
      <article-id pub-id-type="pmid">35040794</article-id>
      <article-id pub-id-type="doi">10.2196/28749</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Crowdsourcing for Machine Learning in Public Health Surveillance: Lessons Learned From Amazon Mechanical Turk</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Kukafka</surname>
            <given-names>Rita</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Xu</surname>
            <given-names>Ronghua</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Das</surname>
            <given-names>Anik</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Shakeri Hossein Abad</surname>
            <given-names>Zahra</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4519-864X</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Butler</surname>
            <given-names>Gregory P</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7536-2044</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Thompson</surname>
            <given-names>Wendy</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-0177-998X</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Lee</surname>
            <given-names>Joon</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <address>
            <institution>Data Intelligence for Health Lab</institution>
            <institution>Cumming School of Medicine</institution>
            <institution>University of Calgary</institution>
            <addr-line>3280 Hospital Dr NW</addr-line>
            <addr-line>Calgary, AB, T2N 4Z6</addr-line>
            <country>Canada</country>
            <phone>1 403 220 2968</phone>
            <email>joonwu.lee@ucalgary.ca</email>
          </address>
          <xref rid="aff4" ref-type="aff">4</xref>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8593-9321</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Biomedical Informatics</institution>
        <institution>Harvard Medical School</institution>
        <institution>Harvard University</institution>
        <addr-line>Boston, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Data Intelligence for Health Lab</institution>
        <institution>Cumming School of Medicine</institution>
        <institution>University of Calgary</institution>
        <addr-line>Calgary, AB</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Centre for Surveillance and Applied Research</institution>
        <institution>Public Health Agency of Canada</institution>
        <addr-line>Ottawa, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Community Health Sciences</institution>
        <institution>Cumming School of Medicine</institution>
        <institution>University of Calgary</institution>
        <addr-line>Calgary, AB</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Department of Cardiac Sciences</institution>
        <institution>Cumming School of Medicine</institution>
        <institution>University of Calgary</institution>
        <addr-line>Calgary, AB</addr-line>
        <country>Canada</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Joon Lee <email>joonwu.lee@ucalgary.ca</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>1</month>
        <year>2022</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>18</day>
        <month>1</month>
        <year>2022</year>
      </pub-date>
      <volume>24</volume>
      <issue>1</issue>
      <elocation-id>e28749</elocation-id>
      <history>
        <date date-type="received">
          <day>13</day>
          <month>3</month>
          <year>2021</year>
        </date>
        <date date-type="rev-request">
          <day>12</day>
          <month>6</month>
          <year>2021</year>
        </date>
        <date date-type="rev-recd">
          <day>5</day>
          <month>7</month>
          <year>2021</year>
        </date>
        <date date-type="accepted">
          <day>15</day>
          <month>11</month>
          <year>2021</year>
        </date>
      </history>
      <copyright-statement>©Zahra Shakeri Hossein Abad, Gregory P Butler, Wendy Thompson, Joon Lee. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 18.01.2022.</copyright-statement>
      <copyright-year>2022</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2022/1/e28749" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Crowdsourcing services, such as Amazon Mechanical Turk (AMT), allow researchers to use the collective intelligence of a wide range of web users for labor-intensive tasks. As the manual verification of the quality of the collected results is difficult because of the large volume of data and the quick turnaround time of the process, many questions remain to be explored regarding the reliability of these resources for developing digital public health systems.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to explore and evaluate the application of crowdsourcing, generally, and AMT, specifically, for developing digital public health surveillance systems.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We collected 296,166 crowd-generated labels for 98,722 tweets, labeled by 610 AMT workers, to develop machine learning (ML) models for detecting behaviors related to physical activity, sedentary behavior, and sleep quality among Twitter users. To infer the ground truth labels and explore the quality of these labels, we studied 4 statistical consensus methods that are agnostic of task features and only focus on worker labeling behavior. Moreover, to model the meta-information associated with each labeling task and leverage the potential of context-sensitive data in the truth inference process, we developed 7 ML models, including traditional classifiers (offline and active), a deep learning–based classification model, and a hybrid convolutional neural network model.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Although most crowdsourcing-based studies in public health have often equated majority vote with quality, the results of our study using a truth set of 9000 manually labeled tweets showed that consensus-based inference models mask underlying uncertainty in data and overlook the importance of task meta-information. Our evaluations across 3 physical activity, sedentary behavior, and sleep quality data sets showed that truth inference is a context-sensitive process, and none of the methods studied in this paper were consistently superior to others in predicting the truth label. We also found that the performance of the ML models trained on crowd-labeled data was sensitive to the quality of these labels, and poor-quality labels led to incorrect assessment of these models. Finally, we have provided a set of practical recommendations to improve the quality and reliability of crowdsourced data.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Our findings indicate the importance of the quality of crowd-generated labels in developing ML models designed for decision-making purposes, such as public health surveillance decisions. A combination of inference models outlined and analyzed in this study could be used to quantitatively measure and improve the quality of crowd-generated labels for training ML models.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>crowdsourcing</kwd>
        <kwd>machine learning</kwd>
        <kwd>digital public health surveillance</kwd>
        <kwd>public health database</kwd>
        <kwd>social media analysis</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>In recent years, social media data have been extensively used in different areas of public health [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>], such as detecting outbreaks and emerging diseases [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>], monitoring adverse drug reactions [<xref ref-type="bibr" rid="ref6">6</xref>], and predicting or modeling health-related behaviors and outcomes [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>]. Since 2011, Twitter has been the most popular form of social media used for public health communication [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. In 2020, Twitter alone reported 500 million tweets generated per day from 145 million daily active users. A recent scoping review of 755 articles on digital public health surveillance shows that Twitter is the most studied of all platforms and the most used platform to study communicable diseases, behavioral risk factors, mental health, drug use, and vaccines [<xref ref-type="bibr" rid="ref11">11</xref>]. Despite the inherent limitations of social media data, such as lack of demographic data and biased populations, when integrated with complex data-driven models such as artificial neural networks, these publicly accessible resources can be used for population-level surveillance to complement traditional public health surveillance (eg, surveys) with faster and less costly longitudinal information.</p>
        <p>Although linguistic annotation is crucial for developing machine learning (ML) and natural language processing (NLP) models, manual labeling of a large volume of data is a notorious problem because of its high cost and labor-intensive nature. In recent years, this problem has been tackled using crowdsourcing technologies such as Amazon Mechanical Turk (AMT) [<xref ref-type="bibr" rid="ref12">12</xref>], Crowdflower [<xref ref-type="bibr" rid="ref13">13</xref>], and Prolific Academic [<xref ref-type="bibr" rid="ref13">13</xref>] to obtain relatively low-cost labeled data more quickly and easily. AMT is a software service operated by Amazon that allows users to crowdsource work, broken into microtasks called HITs (Human Intelligence Tasks), to a large number of workers who are compensated for each HIT completed. With the vast potential applications of crowdsourcing in public health [<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref16">16</xref>], the research community has seen steady growth in the use of AMT in the past 10 years. The number of studies indexed in PubMed using the search term <italic>Amazon Mechanical Turk</italic> AND <italic>public health</italic> has increased sharply from 42 studies in 2015 to 118 studies in 2019.</p>
        <p>However, because of the uncertain quality of AMT workers with unknown expertise, their labels are sometimes unreliable, forcing researchers and practitioners to collect information redundantly, which poses new challenges in the field. Given that in large-scale crowdsourcing tasks the same workers cannot label all the examples, measuring interannotator agreement and managing the quality of workers differ from those of a team of in-house expert workers. Despite the growing popularity of AMT for developing ML models in public health research, the reliability and validity of this service have not yet been investigated. Several public health studies have used AMT for training data-driven ML models without external gold standard comparisons [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref21">21</xref>]. Ayers et al [<xref ref-type="bibr" rid="ref17">17</xref>] used AMT to create a gold standard data set to develop predictive models to detect electronic nicotine delivery systems on social media. Yin et al [<xref ref-type="bibr" rid="ref18">18</xref>] developed a scalable classifier to detect personal health mentions on Twitter based on a gold standard data set generated by AMT workers. The reliability of the crowd-labeled data set in that study was measured based on the agreement among workers.</p>
        <p>Similarly, to characterize sleep quality using Twitter, McIver et al [<xref ref-type="bibr" rid="ref19">19</xref>] used AMT for sentiment annotation of text data and used interannotator agreement to assess the reliability of workers. Reece et al [<xref ref-type="bibr" rid="ref20">20</xref>] used AMT to build a data set and develop a prediction model to detect depression emergence and posttraumatic stress disorder in Twitter users. To control the quality of the data collected, they required the workers to have completed at least 100 tasks, with a minimum 95% approval rating. Although research has supported the efficacy of using reputation to evaluate the quality of crowdsourced data [<xref ref-type="bibr" rid="ref22">22</xref>], the reliability of using this metric in developing ML-based digital public health systems has not yet been investigated. Thus, in this study, in addition to defining qualification requirements for AMT workers, we studied the reliability of crowd-generated training data for developing ML models in the context of public health surveillance. We used AMT to collect 296,166 labels for 98,722 unique tweets, labeled by 610 AMT workers, to develop ML models that can detect the physical activity, sedentary behavior, and sleep quality (PASS) of Twitter users.</p>
      </sec>
      <sec>
        <title>Objectives</title>
        <p>The primary aim of this study is to evaluate the application of AMT for training data-driven ML models by analyzing the quality of crowd-generated labels. As the quality of crowd-generated labels, regardless of the type of the task being studied, is critical to the robustness of ML models trained based on these labels, we created a gold standard data set of labels and applied several statistical and ML-based models to assess the reliability of using the crowd-labeling task from different perspectives (eg, process, design, and inference). To interpret the results of our quality assessment and explore the effect of noisy labels on the applicability of inference models in dealing with these labels, our approach involved evaluating the performance of 4 consensus methods, which do not involve task features in their truth inference, and exploring their feasibility in improving the quality of crowd-labeled data. As these methods are modeled purely as a function of worker behaviors concerning labeling tasks, they cannot leverage the value of context-sensitive information (ie, the task’s meta-information) in their inference decisions. Thus, we collected additional features for our labeling data set and developed 7 ML models, including a deep learning (DL) model and a hybrid convolutional neural network (CNN) architecture to couple worker behaviors with the task’s meta-information when inferring the truth label. To detect and correct noisy labels, we also developed 5 pool-based active learners to iteratively detect the most informative samples (ie, samples with more uncertainty) and remove them from the validation set. Finally, we used SHAP (Shapley Additive Explanations) [<xref ref-type="bibr" rid="ref23">23</xref>] to explore the contribution of different features, including worker behaviors and context-sensitive features, to the results of our supervised inference models.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Labels</title>
        <p>The crowdsourcing tasks, referred to as HITs by AMT, were designed to collect 5 labels based on 2 conditions, self-reported and recent PASS experience, to develop binary and multiclass classification models that can detect PASS-related behavior in Twitter users. The labels of the multiclass prediction models were defined as 11, 10, 01, and 00, based on the value of each condition (Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). We also let workers choose a fifth option, called <italic>unclear</italic>, to ensure they did not give random labels to tasks they were not confident in performing successfully (<xref rid="figure1" ref-type="fig">Figure 1</xref>). We excluded this label for both inference and classification tasks. We defined the binary labels as 1 if both conditions were met and 0, otherwise. The binary labels did not directly come from the AMT workers and were generated by dichotomizing the collected labels.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>A sample labeling task (ie, human intelligence task [HIT]) for sedentary behavior. Each HIT contains 4 questions (section 1), and each asks if the presented tweet is a self-reported physical activity, sedentary behavior, or sleep quality–related behavior (section 2). The fourth question is an easy, qualification question that was used to check the quality of the worker (section 3).</p>
          </caption>
          <graphic xlink:href="jmir_v24i1e28749_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Crowdsourcing Workflow</title>
        <p>We implemented a pipeline to create the HITs, post them on AMT, collect the labels through a quality check process, approve or reject the HITs, and store the results. To minimize noisy and low-quality data, we added a qualification requirement to our tasks and granted labeling access to workers who had demonstrated a high degree of success in performing a wide range of HITs across AMT (ie, master qualification). In addition, we added a simple qualification question to each HIT to detect spammers or irresponsible workers. Each HIT contained 4 questions, including the qualification question, and was assigned to 3 workers (<xref rid="figure1" ref-type="fig">Figure 1</xref> and Figures S2 and S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Workers were asked to select exactly 1 choice per tweet, and HITs with zero or more than one label were rejected during the approval process. Through different iterations of data labeling, workers were paid from US $0.03 to US $0.05 after completing each HIT. We collected the labels for the 98,722 tweets used in this study through different iterations, from April 2019 to June 2020. We regularly checked the quality of the submitted tasks to detect low-quality workers during each iteration and revoke their access to our tasks. Before the formal initiation of the process, we pilot-tested the design, response time, and complexity of the HITs through 2 different iterations and revised the workflow accordingly. We did not collect any personally identifiable information from the workers (participants) during the data labeling task. The experiments were carried out in accordance with the relevant guidelines and the University of Calgary Conjoint Faculties Research Ethics Board regulations. We implemented the entire workflow in Python and used Boto3 Python software development kit to connect to and work with AMT.</p>
      </sec>
      <sec>
        <title>Data Collection</title>
        <p>We collected data for this study from Twitter using the Twitter livestream application programming interface (API) for the period between November 28, 2018, and June 30, 2020. The data set was filtered to include only Canadian tweets relevant to PASS. A total of 103,911 tweets were selected from 22,729,110 Canadian tweets using keywords and regular expressions related to PASS categories. Each of these 103,911 tweets was labeled by 3 AMT workers, of which 98,722 tweets received 3 valid labels, with almost half of them related to physical activity.</p>
        <p>The demographic variables of age and gender and the information about the source of each tweet (eg, organization vs real users) were not available within the data set collected from Twitter. We estimated these variables for each tweet using the M3 inference package in Python [<xref ref-type="bibr" rid="ref24">24</xref>], which uses a multimodal deep neural architecture for the joint classification of age, gender, and information sources of social media data. The text (tweet) field and each of the daytime, weekday, and month variables were extracted from the metadata provided by the Twitter API.</p>
        <p>We have made the Twitter data set used in this study publicly available [<xref ref-type="bibr" rid="ref25">25</xref>].</p>
      </sec>
      <sec>
        <title>Data Processing</title>
        <p>Tweets have a bounding box of coordinates, which enables spatial mapping to their respective city locations. As the Twitter API returns datetime values in Coordinated Universal Time, we used a time zone finder in Python and adjusted the time of each tweet based on its spatial data. Given that daytime, month, and weekday can be influential factors in tweeting about each of the PASS categories, and to better use the datetime data (%a %b %d %H: %M: %S %Y), we extracted a: weekday, b: month, and H: hour fields and stored them as separate features.</p>
        <p>We cleaned the text column by eliminating all special characters (eg, #, &#38;, and @), punctuation, weblinks, and numbers. We also replaced common contractions with their uncontracted forms; for example, <italic>I’ll</italic> was resolved as <italic>I will</italic>. While developing and evaluating our NLP models, we noticed that the impact of removing stop words, stemming, and converting the text to lower case on the performance of our predictive models was not noticeable. This could relate to the ability of transfer-learning techniques (ie, GloVe embeddings) to generalize on unseen data. Thus, we applied neither stop-word removal nor lexical cleaning on the textual features of our data set. Moreover, as hashtags and emojis can be used as independent words and facilitate emotional expressions, we did not remove them during the cleaning process.</p>
        <p>To develop the ML models, all categorical data were encoded into dummy variables using one-hot encoding, and as we only approved HITs with complete answers, this data set did not contain any missing data.</p>
      </sec>
      <sec>
        <title>Label Consistency</title>
        <p>To measure the consistency of answers given by the workers, we calculated label consistency (LC) as the average entropy of the collected labels for each PASS category [<xref ref-type="bibr" rid="ref26">26</xref>]. For each tweet <italic>t<sub>i</sub></italic> ∈ <italic>T<sub>s</sub></italic>, where <italic>T<sub>s</sub></italic> denotes the set of all tweets related to surveillance category <italic>s</italic> ∈ {physical activity, sleep quality, sedentary behavior}, <italic>n<sub>ij</sub></italic> defines the number of answers given to the <italic>j<sup>th</sup></italic> choice (<italic>j</italic> ∈ {1,2,3,4,5}, as we had 5 choices for each tweet). We calculated <italic>LC<sub>s</sub></italic> as follows:</p>
        <disp-formula>
          <graphic xlink:href="jmir_v24i1e28749_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>&#124;s&#124; denotes the size of the surveillance category <italic>s</italic> and, as we collected 3 labels for each tweet, the denominators in the entropy formula received a constant value of 3. <italic>LC</italic> ranges from 0 to 1, and values close to 1 show more consistency among the workers’ input.</p>
      </sec>
      <sec>
        <title>Ground Truth Data Set</title>
        <p>To investigate the viability of unsupervised inference models in predicting truth labels from crowd-labeled data and compare it with that of supervised predictive models, we used a random sample of our data set as a ground truth set (ie, 9000 tweets: 4000 tweets for physical activity, 3000 tweets for sleep quality, and 2000 tweets for sedentary behavior). In total, 6 data scientists manually labeled this sample, and the entire labeled data set was reviewed manually and relabeled by an experienced in-house domain expert in both ML and public health surveillance. The disagreements between this data set and the crowd-labeled data set were manually checked to exclude any labeling bias that could impact the results of this study.</p>
      </sec>
      <sec>
        <title>Inference Models</title>
        <p>The majority voting (MV) approach estimates the actual ground truth based on the label chosen by most workers. For example, defining the estimated label as <inline-graphic xlink:href="jmir_v24i1e28749_fig14.png" xlink:type="simple" mimetype="image"/>, and the submitted label by worker <italic>w</italic> as <italic>l<sub>w</sub></italic>, the MV approach, for a binary labeling task, assigns 1 to <inline-graphic xlink:href="jmir_v24i1e28749_fig14.png" xlink:type="simple" mimetype="image"/> if <inline-graphic xlink:href="jmir_v24i1e28749_fig7.png" xlink:type="simple" mimetype="image"/> and 0, otherwise. Although the reliability of individual workers, who come from different backgrounds with different quality levels, varies, the MV approach assumes equal expertise among the workers and does not model worker behaviors [<xref ref-type="bibr" rid="ref27">27</xref>]. As this approach is completely task-independent, it does not involve task properties in the inference process; thus, it is fast.</p>
        <p>The Dawid and Skene (DS) [<xref ref-type="bibr" rid="ref28">28</xref>] approach uses expectation–maximization (EM) to simultaneously estimate the error rate of annotators (workers) and latent label classes, when, similar to MV, the ground truth is unknown, and workers are assumed to operate independently. Unlike MV, which is agnostic to worker behavior, DS models worker <italic>k</italic>’s behavior as a function of each task’s true label by creating a confusion matrix <italic>π<sup>k</sup></italic> with size <italic>L</italic> × <italic>L</italic>, where <italic>L</italic> is a fixed number and represents the number of possible labels for a single-labeled classification task. DS defines worker <italic>k</italic>’s error rate <inline-graphic xlink:href="jmir_v24i1e28749_fig8.png" xlink:type="simple" mimetype="image"/> as follows:</p>
        <disp-formula>
          <graphic xlink:href="jmir_v24i1e28749_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>As not all workers need to label all the tasks, and a worker may label the same task more than once, sparsity can be a problem in large-scale labeling tasks when using the DS approach [<xref ref-type="bibr" rid="ref27">27</xref>]. DS iteratively estimates the true label of each task based on the worker’s quality and estimates the worker’s error rate (quality) based on the inferred labels until it converges. Although the worker-specific confusion matrix generates the quality score of each worker, it may not be sufficient to measure the actual contribution of each worker [<xref ref-type="bibr" rid="ref29">29</xref>]. The inherent complexity of a task, especially in NLP, or a worker’s bias may result in wrong labels, although the worker is quantitatively accurate.</p>
        <p>The generative model of labels, abilities, and difficulties (GLAD) [<xref ref-type="bibr" rid="ref30">30</xref>] models the quality of workers as a function of the input task using parameter <italic>α</italic>. The quality parameter ranges from –∞ to +∞, implying that the worker always labels the tasks incorrectly or correctly, respectively. When <italic>α</italic>=0, the worker cannot distinguish among the labels, and their input does not contribute to the task’s correct label. To estimate the ground truth, in addition to the workers’ quality, GLAD models the difficulty of task <italic>t<sub>i</sub></italic> as <italic>d<sub>i</sub></italic>=1/<italic>β<sub>i</sub></italic>, where <italic>β<sub>i</sub></italic>&#62;0. The difficulty index ranges from 0 to ∞, where <italic>d<sub>i</sub></italic>=∞ classifies <italic>t<sub>i</sub></italic> as the most difficult task, and <italic>d<sub>i</sub></italic>=0 means that the task always receives a correct label, even from the workers with <italic>α</italic>≤0. GLAD uses the EM approach to obtain the maximum likelihood estimation of <italic>α</italic> and <italic>β</italic>, and models the probability that worker <italic>k</italic> correctly labels <italic>t<sub>i</sub></italic> using <inline-graphic xlink:href="jmir_v24i1e28749_fig10.png" xlink:type="simple" mimetype="image"/>.</p>
        <p>Similar to DS, Raykar algorithm (RY) [<xref ref-type="bibr" rid="ref31">31</xref>] forms a confusion matrix to model a worker’s quality. In addition, in the case of binary classification, it models worker’s bias toward the positive class (ie, sensitivity) and toward the negative class (ie, specificity) using beta prior [<xref ref-type="bibr" rid="ref27">27</xref>]. Worker bias in this context usually occurs when a worker underestimates or overestimates the truth of a task [<xref ref-type="bibr" rid="ref26">26</xref>]. As with DS and GLAD, RY uses an unsupervised EM approach to estimate each of the model parameters and truth labels. Depending on the availability of task-specific features, RY can either use automatic supervised classifiers or fall back to unsupervised EM models to estimate the truth label.</p>
      </sec>
      <sec>
        <title>Predictive Models</title>
        <p>As the meta-information associated with each task may reveal its underlying complexity and thus help model worker behaviors, we developed a set of ML models to involve this metadata in the inference process. Models were trained based on quintuple <italic>F</italic>: (<italic>W</italic>,<italic>I</italic>,<italic>M</italic>,<italic>t</italic>,<italic>l</italic>), where <italic>W</italic> = {<italic>w<sub>1</sub></italic>,...,<italic>w<sub>k</sub></italic>} represents labels collected from AMT workers, <italic>I</italic> = {<italic>MV</italic>,<italic>DS</italic>,<italic>RY</italic>,<italic>GLAD</italic>} denotes the results of inference models, and <italic>M</italic> denotes metadata associated with each tweet including time (ie, weekday, month, and daytime), gender, age group, and the source of the tweet (ie, organization vs real people). The text of each tweet is presented by <italic>t</italic>, and <italic>l</italic> denotes the truth label.</p>
        <p>To mitigate the risk of biased results caused by a specific learning algorithm and overcome the overfitting problem, we developed and evaluated 5 standard ML classifiers with different architectures, including generalized linear (logistic regression [LR]), kernel-based (support vector machines [SVM]), decision-tree–based (random forest and XGBoost), and sample-based (K-nearest neighbors [KNN]) classifiers. Moreover, to incorporate textual features into our analysis, we developed a hybrid DL architecture in which a CNN based on long short-term memory (LSTM) learns textual data <italic>t</italic> and a multilayer perceptron deep neural network learns metadata <italic>(W,I,M)</italic>. The cleaned text, represented as an integer-encoded vector, is converted into pretrained tweet word embeddings using GloVe [<xref ref-type="bibr" rid="ref32">32</xref>] (containing 2 billion tweets, 27 billion tokens, and 1.2 million vocabularies) in the embedding layer. The output of this layer is passed through an LSTM layer for sequence modeling, followed by 1 dropout layer to avoid overfitting and 2 dense ReLU (Rectified Linear Unit) layers. Simultaneously, the metadata of each tweet is passed through 3 fully connected layers with ReLU activation. The outputs of these networks are concatenated into a dense layer, followed by 2 fully connected dense layers, terminating at an output layer with softmax activation, cross-entropy loss, and the adam optimizer. A high-level presentation of this architecture is shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>The pipeline of the deep learning model used to predict labels using both textual information and meta-information. LSTM: long short-term memory.</p>
          </caption>
          <graphic xlink:href="jmir_v24i1e28749_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>To counter the bias caused by class imbalance, for both multiclass and binary classification tasks, we used the class-weight approach to incorporate the weight of each class into the cost function by assigning higher weights to minority classes and lower weights to the majority classes. We also used the SMOTE (Synthetic Minority Oversampling Technique-Nominal Continuous) [<xref ref-type="bibr" rid="ref33">33</xref>] approach to oversample the minority classes by creating synthetic samples based on their feature space. However, we did not notice much difference between using and not using the synthetic minority oversampling technique. Thus, our final models were trained using the class-weight approach. The hyperparameters for each method were determined using a nested 10-fold cross-validation Bayesian optimization [<xref ref-type="bibr" rid="ref34">34</xref>].</p>
        <p>As the main goal of both supervised and unsupervised label inference models was to minimize the number of false-negative and false-positive inferences, to evaluate the models developed in this study, we used precision, recall, F1, and precision-recall area under the curve (AUC<sub>PR</sub>) metrics.</p>
        <p>All the computations and predictive models were implemented using Python 3.7 with TensorFlow 2.0 [<xref ref-type="bibr" rid="ref35">35</xref>], Keras [<xref ref-type="bibr" rid="ref36">36</xref>], and Scikit-learn [<xref ref-type="bibr" rid="ref37">37</xref>] libraries. To facilitate the replication of our study, the code repository of this study is publicly available on GitHub [<xref ref-type="bibr" rid="ref38">38</xref>].</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Raw Labels From AMT Workers</title>
        <p>In total, 610 unique workers participated in our data labeling tasks and completed 103,911 HITs, from which 5189 HITs were removed as they did not receive 3 valid answers. We approved 98,722 tasks for further analysis. Most workers (530/610, 86.9%) completed &#60;100 HITs, of which 164 completed only 1 HIT. Among the workers who completed &#62;5000 HITs, 1 worker completed 21,801 HITs and 3 workers completed between 5000 and 10,000 HITs (<xref rid="figure3" ref-type="fig">Figure 3</xref>). The calculated <italic>LC</italic> for each PASS category for multiclass labeling was 0.54, 0.58, and 0.55 and for binary labeling was 0.75, 0.77, and 0.74 (<xref ref-type="table" rid="table1">Table 1</xref>). This implies a high level of label inconsistency, prompting the need for further label quality analysis for the development of ML models.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>The number of workers who completed different numbers of human intelligence tasks (HITs). Most workers completed a relatively small number of HITs.</p>
          </caption>
          <graphic xlink:href="jmir_v24i1e28749_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Details of the collected labels and label consistency (LC) score for each of the physical activity, sleep quality, and sedentary behavior categories. LC ranges from 0 to 1, and the values close to 1 show more consistency among workers’ input.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="250"/>
            <col width="250"/>
            <col width="170"/>
            <col width="170"/>
            <col width="160"/>
            <thead>
              <tr valign="top">
                <td>Type</td>
                <td>Tweets, n (%)</td>
                <td>LC<sub>multi</sub></td>
                <td>LC<sub>binary</sub></td>
                <td>Workers, n (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Physical activity</td>
                <td>48,576 (49.2)</td>
                <td>0.54</td>
                <td>0.75</td>
                <td>232 (38)</td>
              </tr>
              <tr valign="top">
                <td>Sedentary behavior</td>
                <td>17,367 (17.6)</td>
                <td>0.55</td>
                <td>0.74</td>
                <td>157 (25.7)</td>
              </tr>
              <tr valign="top">
                <td>Sleep quality</td>
                <td>32,779 (33.2)</td>
                <td>0.58</td>
                <td>0.77</td>
                <td>221 (36.2)</td>
              </tr>
              <tr valign="top">
                <td>Total</td>
                <td>98,722 (100)</td>
                <td>0.56</td>
                <td>0.75</td>
                <td>610 (100)</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Truth Inference</title>
        <p><xref ref-type="table" rid="table2">Table 2</xref> describes the ground truth data set of 9000 tweets that was used to train the truth inference models. <xref ref-type="table" rid="table3">Table 3</xref> lists the inference results obtained from the 4 unsupervised models and 7 supervised predictive models, including 2 DL models, on the ground truth data set. Each model was evaluated on both binary and multiclass versions of the data set for each PASS category. Among the unsupervised models for physical activity and sleep quality, DS and RY performed better than MV and GLAD for all performance metrics, whereas MV outperformed the other models on the sleep quality data set. Interestingly, for binary inference across all PASS categories, MV outperformed or performed just as well as the other methods, indicating the impact of task complexity on the performance of inference methods.</p>
        <p>DL<italic><sub>meta</sub></italic> outperforms other methods with the minimum number of false positives (precision: 78%) for the multiclass classification task, but other methods performed better with respect to recall, F1, and AUC<sub>PR</sub> metrics. Performance on each PASS data set for binary classification did not highlight any individual method constantly performing best. For example, whereas SVM showed the best performance for physical activity, KNN and LR outperformed other models for sleep quality and sedentary behavior, respectively. LR achieved superior performance across all data sets for the multiclass inference task. To analyze this further, we modified the hyperparameters of the LR algorithm presented in <xref ref-type="table" rid="table3">Table 3</xref> to stochastic average gradient solver and <italic>l<sub>2</sub></italic> regularization and the optimizer of the hybrid neural network to stochastic gradient descent and repeated the comparisons. LR still outperformed the neural network model by more than 2% in all metrics. The poor performance of the neural networks in this study could be attributed to the imbalanced ratio of data (per class) to the model parameters (ie, high variance).</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Characteristics of the ground truth data set used to develop and evaluate the supervised and unsupervised inference models.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="30"/>
            <col width="190"/>
            <col width="0"/>
            <col width="250"/>
            <col width="0"/>
            <col width="250"/>
            <col width="0"/>
            <col width="250"/>
            <thead>
              <tr valign="top">
                <td colspan="4">Variable</td>
                <td colspan="2">Physical activity (n=4000)</td>
                <td colspan="2">Sedentary behavior (n=2000)</td>
                <td>Sleep quality (n=3000)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="9">
                  <bold>Labels, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="8">
                  <bold>Binary</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Yes</td>
                <td colspan="2">1629 (40.73)</td>
                <td colspan="2">726 (36.3)</td>
                <td colspan="2">1063 (35.43)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>No</td>
                <td colspan="2">2371 (59.28)</td>
                <td colspan="2">1274 (63.7)</td>
                <td colspan="2">1937 (64.57)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="8">
                  <bold>Multiclass</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>YY<sup>a</sup></td>
                <td colspan="2">1629 (40.73)</td>
                <td colspan="2">726 (36.3)</td>
                <td colspan="2">1063 (35.43)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>YN<sup>b</sup></td>
                <td colspan="2">550 (13.75)</td>
                <td colspan="2">395 (19.75)</td>
                <td colspan="2">862 (28.73)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>NY<sup>c</sup></td>
                <td colspan="2">179 (4.48)</td>
                <td colspan="2">19 (0.95)</td>
                <td colspan="2">52 (1.73)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>NN<sup>d</sup></td>
                <td colspan="2">1642 (41.05)</td>
                <td colspan="2">860 (43)</td>
                <td colspan="2">1023 (34.1)</td>
              </tr>
              <tr valign="top">
                <td colspan="9">
                  <bold>Gender, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Female</td>
                <td colspan="2">1131 (28.28)</td>
                <td colspan="2">576 (28.80)</td>
                <td colspan="2">469 (15.63)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Male</td>
                <td colspan="2">1980 (49.50)</td>
                <td colspan="2">906 (45.30)</td>
                <td colspan="2">490 (16.34)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Unknown</td>
                <td colspan="2">889 (22.22)</td>
                <td colspan="2">518 (25.90)</td>
                <td colspan="2">2041 (68.03)</td>
              </tr>
              <tr valign="top">
                <td colspan="9">
                  <bold>Age range</bold>
                  <bold>(years), n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">≤18</td>
                <td colspan="2">204 (5.10)</td>
                <td colspan="2">170 (8.50)</td>
                <td colspan="2">150 (5)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">19-29</td>
                <td colspan="2">743 (18.58)</td>
                <td colspan="2">475 (23.75)</td>
                <td colspan="2">331 (11.03)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">30-39</td>
                <td colspan="2">897 (22.42)</td>
                <td colspan="2">365 (18.25)</td>
                <td colspan="2">249 (8.30)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">≥40</td>
                <td colspan="2">1267 (31.68)</td>
                <td colspan="2">472 (23.60)</td>
                <td colspan="2">229 (7.64)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Unknown</td>
                <td colspan="2">889 (22.22)</td>
                <td colspan="2">518 (25.90)</td>
                <td colspan="2">2041 (68.03)</td>
              </tr>
              <tr valign="top">
                <td colspan="9">
                  <bold>Day of week, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Sunday</td>
                <td colspan="2">664 (16.60)</td>
                <td colspan="2">325 (16.25)</td>
                <td colspan="2">440 (14.66)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Monday</td>
                <td colspan="2">595 (14.88)</td>
                <td colspan="2">307 (15.35)</td>
                <td colspan="2">440 (14.66)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Tuesday</td>
                <td colspan="2">493 (12.32)</td>
                <td colspan="2">245 (12.25)</td>
                <td colspan="2">435 (14.50)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Wednesday</td>
                <td colspan="2">504 (12.60)</td>
                <td colspan="2">278 (13.9)</td>
                <td colspan="2">393 (13.10)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Thursday</td>
                <td colspan="2">525 (13.12)</td>
                <td colspan="2">270 (13.50)</td>
                <td colspan="2">416 (13.86)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Friday</td>
                <td colspan="2">531 (13.28)</td>
                <td colspan="2">274 (13.70)</td>
                <td colspan="2">421 (14.03)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Saturday</td>
                <td colspan="2">668 (16.70)</td>
                <td colspan="2">283 (14.15)</td>
                <td colspan="2">433 (14.43)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Unknown</td>
                <td colspan="2">20 (0.50)</td>
                <td colspan="2">18 (0.90)</td>
                <td colspan="2">22 (0.76)</td>
              </tr>
              <tr valign="top">
                <td colspan="4">Time (24 hours), Q1-Q3</td>
                <td colspan="2">10-19</td>
                <td colspan="2">10-19</td>
                <td>5-18</td>
              </tr>
              <tr valign="top">
                <td colspan="4">Month (range)</td>
                <td colspan="2">February to July</td>
                <td colspan="2">April to September</td>
                <td>January to August</td>
              </tr>
              <tr valign="top">
                <td colspan="9">
                  <bold>Source, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Organization</td>
                <td colspan="2">563 (14.08)</td>
                <td colspan="2">179 (8.95)</td>
                <td colspan="2">97 (3.23)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Users</td>
                <td colspan="2">3437 (85.93)</td>
                <td colspan="2">1821 (91.05)</td>
                <td colspan="2">2903 (96.77)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>YY: self-reported and recent physical activity, sedentary behavior, and sleep quality experience.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>YN: self-reported but not recent physical activity, sedentary behavior, and sleep quality experience.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>NY: not self-reported but recent physical activity, sedentary behavior, and sleep quality experience.</p>
            </fn>
            <fn id="table2fn4">
              <p><sup>d</sup>NN: neither self-reported nor recent physical activity, sedentary behavior, and sleep quality experience.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Performance of the truth inference methods using a ground truth data set of 9000 labeled tweets: 4000 physical activity, 2000 sedentary behavior, and 3000 sleep quality tweets. The top 4 rows of each PASS (physical activity, sedentary behavior, and sleep quality) category represent the results of the applied unsupervised truth inference models.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="170"/>
            <col width="120"/>
            <col width="80"/>
            <col width="0"/>
            <col width="120"/>
            <col width="80"/>
            <col width="0"/>
            <col width="120"/>
            <col width="80"/>
            <col width="0"/>
            <col width="120"/>
            <col width="80"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Tweets and method</td>
                <td colspan="3">Precision (%)</td>
                <td colspan="3">Recall (%)</td>
                <td colspan="3">F1 (%)</td>
                <td colspan="2">AUC<sub>PR</sub><sup>a</sup> (%)</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td>Multiclass</td>
                <td>Binary</td>
                <td colspan="2">Multiclass</td>
                <td>Binary</td>
                <td colspan="2">Multiclass</td>
                <td>Binary</td>
                <td colspan="2">Multiclass</td>
                <td>Binary</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="13">
                  <bold>Physical activity</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>MV<sup>b</sup></td>
                <td>72</td>
                <td>85</td>
                <td colspan="2">70</td>
                <td>
                  <italic>85</italic>
                  <sup>c</sup>
                </td>
                <td colspan="2">71</td>
                <td>84</td>
                <td colspan="2">56</td>
                <td>85</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>DS<sup>d</sup></td>
                <td>74</td>
                <td>85</td>
                <td colspan="2">68</td>
                <td>
                  <italic>85</italic>
                </td>
                <td colspan="2">70</td>
                <td>84</td>
                <td colspan="2">54</td>
                <td>85</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GLAD<sup>e</sup></td>
                <td>73</td>
                <td>84</td>
                <td colspan="2">70</td>
                <td>84</td>
                <td colspan="2">71</td>
                <td>83</td>
                <td colspan="2">57</td>
                <td>84</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>RY<sup>f</sup></td>
                <td>74</td>
                <td>85</td>
                <td colspan="2">68</td>
                <td>
                  <italic>85</italic>
                </td>
                <td colspan="2">70</td>
                <td>84</td>
                <td colspan="2">54</td>
                <td>84</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>LR<sup>g</sup></td>
                <td>74</td>
                <td>85</td>
                <td colspan="2">
                  <italic>75</italic>
                </td>
                <td>
                  <italic>85</italic>
                </td>
                <td colspan="2">
                  <italic>74</italic>
                </td>
                <td>
                  <italic>85</italic>
                </td>
                <td colspan="2">
                  <italic>61</italic>
                </td>
                <td>87</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>KNN<sup>h</sup></td>
                <td>74</td>
                <td>85</td>
                <td colspan="2">74</td>
                <td>
                  <italic>85</italic>
                </td>
                <td colspan="2">73</td>
                <td>84</td>
                <td colspan="2">60</td>
                <td>
                  <italic>88</italic>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>SVM<sup>i</sup></td>
                <td>72</td>
                <td>
                  <italic>86</italic>
                </td>
                <td colspan="2">73</td>
                <td>
                  <italic>85</italic>
                </td>
                <td colspan="2">73</td>
                <td>
                  <italic>85</italic>
                </td>
                <td colspan="2">
                  <italic>61</italic>
                </td>
                <td>
                  <italic>88</italic>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>RF<sup>j</sup></td>
                <td>73</td>
                <td>85</td>
                <td colspan="2">74</td>
                <td>84</td>
                <td colspan="2">73</td>
                <td>
                  <italic>85</italic>
                </td>
                <td colspan="2">60</td>
                <td>87</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>XGBoost</td>
                <td>72</td>
                <td>81</td>
                <td colspan="2">72</td>
                <td>81</td>
                <td colspan="2">71</td>
                <td>81</td>
                <td colspan="2">58</td>
                <td>83</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>DL<sub>meta</sub><sup>k</sup></td>
                <td>
                  <italic>79</italic>
                </td>
                <td>84</td>
                <td colspan="2">68</td>
                <td>84</td>
                <td colspan="2">73</td>
                <td>84</td>
                <td colspan="2">60</td>
                <td>78</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>DL<sub>text_and_meta</sub></td>
                <td>78</td>
                <td>84</td>
                <td colspan="2">70</td>
                <td>84</td>
                <td colspan="2">73</td>
                <td>84</td>
                <td colspan="2">60</td>
                <td>78</td>
              </tr>
              <tr valign="top">
                <td colspan="13">
                  <bold>Sedentary behavior</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>MV</td>
                <td>71</td>
                <td>82</td>
                <td colspan="2">68</td>
                <td>82</td>
                <td colspan="2">68</td>
                <td>82</td>
                <td colspan="2">54</td>
                <td>80</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>DS</td>
                <td>70</td>
                <td>81</td>
                <td colspan="2">62</td>
                <td>81</td>
                <td colspan="2">65</td>
                <td>81</td>
                <td colspan="2">48</td>
                <td>79</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GLAD</td>
                <td>71</td>
                <td>79</td>
                <td colspan="2">68</td>
                <td>79</td>
                <td colspan="2">68</td>
                <td>79</td>
                <td colspan="2">54</td>
                <td>77</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>RY</td>
                <td>70</td>
                <td>81</td>
                <td colspan="2">62</td>
                <td>81</td>
                <td colspan="2">65</td>
                <td>81</td>
                <td colspan="2">48</td>
                <td>79</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>LR</td>
                <td>72</td>
                <td>
                  <italic>83</italic>
                </td>
                <td colspan="2">
                  <italic>72</italic>
                </td>
                <td>
                  <italic>83</italic>
                </td>
                <td colspan="2">70</td>
                <td>
                  <italic>83</italic>
                </td>
                <td colspan="2">
                  <italic>58</italic>
                </td>
                <td>
                  <italic>81</italic>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>KNN</td>
                <td>71</td>
                <td>82</td>
                <td colspan="2">71</td>
                <td>82</td>
                <td colspan="2">67</td>
                <td>82</td>
                <td colspan="2">56</td>
                <td>80</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>SVM</td>
                <td>73</td>
                <td>
                  <italic>83</italic>
                </td>
                <td colspan="2">
                  <italic>72</italic>
                </td>
                <td>
                  <italic>83</italic>
                </td>
                <td colspan="2">70</td>
                <td>
                  <italic>83</italic>
                </td>
                <td colspan="2">
                  <italic>58</italic>
                </td>
                <td>
                  <italic>81</italic>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>RF</td>
                <td>72</td>
                <td>
                  <italic>83</italic>
                </td>
                <td colspan="2">
                  <italic>72</italic>
                </td>
                <td>82</td>
                <td colspan="2">69</td>
                <td>
                  <italic>83</italic>
                </td>
                <td colspan="2">57</td>
                <td>
                  <italic>81</italic>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>XGBoost</td>
                <td>68</td>
                <td>82</td>
                <td colspan="2">69</td>
                <td>82</td>
                <td colspan="2">67</td>
                <td>82</td>
                <td colspan="2">54</td>
                <td>80</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>DL<sub>meta</sub></td>
                <td>
                  <italic>78</italic>
                </td>
                <td>80</td>
                <td colspan="2">65</td>
                <td>80</td>
                <td colspan="2">
                  <italic>71</italic>
                </td>
                <td>80</td>
                <td colspan="2">56</td>
                <td>73</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>DL<sub>text/meta</sub></td>
                <td>
                  <italic>78</italic>
                </td>
                <td>80</td>
                <td colspan="2">65</td>
                <td>80</td>
                <td colspan="2">
                  <italic>71</italic>
                </td>
                <td>80</td>
                <td colspan="2">56</td>
                <td>75</td>
              </tr>
              <tr valign="top">
                <td colspan="13">
                  <bold>Sleep quality</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>MV</td>
                <td>78</td>
                <td>
                  <italic>89</italic>
                </td>
                <td colspan="2">74</td>
                <td>
                  <italic>89</italic>
                </td>
                <td colspan="2">75</td>
                <td>
                  <italic>89</italic>
                </td>
                <td colspan="2">61</td>
                <td>87</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>DS</td>
                <td>80</td>
                <td>
                  <italic>89</italic>
                </td>
                <td colspan="2">74</td>
                <td>
                  <italic>89</italic>
                </td>
                <td colspan="2">
                  <italic>77</italic>
                </td>
                <td>
                  <italic>89</italic>
                </td>
                <td colspan="2">62</td>
                <td>87</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GLAD</td>
                <td>79</td>
                <td>85</td>
                <td colspan="2">75</td>
                <td>85</td>
                <td colspan="2">76</td>
                <td>85</td>
                <td colspan="2">62</td>
                <td>82</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>RY</td>
                <td>80</td>
                <td>
                  <italic>89</italic>
                </td>
                <td colspan="2">74</td>
                <td>
                  <italic>89</italic>
                </td>
                <td colspan="2">76</td>
                <td>
                  <italic>89</italic>
                </td>
                <td colspan="2">62</td>
                <td>87</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>LR</td>
                <td>76</td>
                <td>88</td>
                <td colspan="2">
                  <italic>77</italic>
                </td>
                <td>87</td>
                <td colspan="2">
                  <italic>77</italic>
                </td>
                <td>88</td>
                <td colspan="2">64</td>
                <td>88</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>KNN</td>
                <td>76</td>
                <td>
                  <italic>89</italic>
                </td>
                <td colspan="2">
                  <italic>77</italic>
                </td>
                <td>
                  <italic>89</italic>
                </td>
                <td colspan="2">
                  <italic>77</italic>
                </td>
                <td>
                  <italic>89</italic>
                </td>
                <td colspan="2">63</td>
                <td>
                  <italic>89</italic>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>SVM</td>
                <td>76</td>
                <td>88</td>
                <td colspan="2">
                  <italic>77</italic>
                </td>
                <td>88</td>
                <td colspan="2">
                  <italic>77</italic>
                </td>
                <td>88</td>
                <td colspan="2">64</td>
                <td>88</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>RF</td>
                <td>75</td>
                <td>
                  <italic>89</italic>
                </td>
                <td colspan="2">76</td>
                <td>
                  <italic>89</italic>
                </td>
                <td colspan="2">76</td>
                <td>
                  <italic>89</italic>
                </td>
                <td colspan="2">63</td>
                <td>
                  <italic>89</italic>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>XGBoost</td>
                <td>72</td>
                <td>87</td>
                <td colspan="2">72</td>
                <td>
                  <italic>89</italic>
                </td>
                <td colspan="2">72</td>
                <td>87</td>
                <td colspan="2">58</td>
                <td>87</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>DL<sub>meta</sub></td>
                <td>
                  <italic>82</italic>
                </td>
                <td>86</td>
                <td colspan="2">72</td>
                <td>86</td>
                <td colspan="2">76</td>
                <td>86</td>
                <td colspan="2">63</td>
                <td>81</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>DL<sub>text/meta</sub></td>
                <td>80</td>
                <td>87</td>
                <td colspan="2">72</td>
                <td>87</td>
                <td colspan="2">76</td>
                <td>87</td>
                <td colspan="2">
                  <italic>65</italic>
                </td>
                <td>82</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>AUC<sub>PR</sub>: precision-recall area under the curve.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>MV: majority voting.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>Italicization indicates best performance for the metric and each PASS (physical activity, sedentary behavior, and sleep quality) category.</p>
            </fn>
            <fn id="table3fn4">
              <p><sup>d</sup>DS: Dawid and Skene.</p>
            </fn>
            <fn id="table3fn5">
              <p><sup>e</sup>GLAD: generative model of labels, abilities, and difficulties.</p>
            </fn>
            <fn id="table3fn6">
              <p><sup>f</sup>RY: Raykar algorithm.</p>
            </fn>
            <fn id="table3fn7">
              <p><sup>g</sup>LR: logistic regression.</p>
            </fn>
            <fn id="table3fn8">
              <p><sup>h</sup>KNN: K-nearest neighbors.</p>
            </fn>
            <fn id="table3fn9">
              <p><sup>i</sup>SVM: support vector machine.</p>
            </fn>
            <fn id="table3fn10">
              <p><sup>j</sup>RF: random forest.</p>
            </fn>
            <fn id="table3fn11">
              <p><sup>k</sup>DL: deep learning.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>Across all data sets, supervised models consistently performed better than unsupervised methods. This highlights the value of the context-sensitive information that was used as meta-information when training supervised models. However, on sleep quality, a data set with the same features and level of complexity as physical activity and sedentary behavior data sets, MV appears sufficient for the binary inference task, with supervised models providing little or no improvement.</p>
        <p>The hybrid CNN architecture did not provide any gain on either the unsupervised inference models or the supervised predictive models (ie, LR, KNN, SVM, RF, XGBoost, and DL<sub>meta</sub>), and in some ways, underperformed them. It is possible that the LSTM stream could not capture the underlying dynamics of the features because of the inconsistencies between the poorly labeled tasks and the textual features.</p>
      </sec>
      <sec>
        <title>Active Learning</title>
        <p>To further explore the feasibility of correcting mislabeled samples, we used pool-based active learning [<xref ref-type="bibr" rid="ref39">39</xref>] with uncertainty sampling. Pool-based active learning assumes that only a small set of data is labeled, and a large pool of data still needs to be labeled through an iterative learning process. All samples in the pool are queried based on an informativeness measure, which improves the learner’s discrimination ability [<xref ref-type="bibr" rid="ref40">40</xref>]. In this study, our learners were modeled to query the most ambivalent and uncertain samples. For example, for the binary label inference task, samples for which <italic>p</italic>(<inline-graphic xlink:href="jmir_v24i1e28749_fig15.png" xlink:type="simple" mimetype="image"/> = <italic>l</italic> &#124; <italic>f</italic>) ≈ 0.5 are the most informative samples that may help detect mislabeled samples of the data set through different iterations. We used 5 different base learners with different architectures (ie, RF, LR, KNN, SVM, and XGBoost) with a batch size of 5 and queried the unlabeled pool through 100 iterations.</p>
        <p>Our results show that, during the learning process, the accuracy of the classifiers generally increased, slightly degraded at some iterations, and stabilized around iteration 60 for KNN and iteration 20 for other classifiers (<xref rid="figure4" ref-type="fig">Figure 4</xref>). Although the active learners in this study could improve their predictive ability through a self-learning process, they failed to correct mislabeled samples and stabilized at performance scores lower than those of the offline learners discussed earlier (<xref ref-type="table" rid="table3">Table 3</xref>).</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Incremental classification accuracy using pool-based active learning. KNN: K-nearest neighbors; LR: logistic regression; RF: random forest; SVM: support vector machine; XGB: XGBoost.</p>
          </caption>
          <graphic xlink:href="jmir_v24i1e28749_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Practical Recommendations</title>
        <p>We start this section with some practical recommendations and guidelines on the use of AMT in particular and crowdsourcing in general for developing ML-based public health surveillance systems. Even under the assumption that more advanced artificial intelligence models, including pretrained models on general scope data sets and transfer-learning techniques, can cope with the poor quality of crowd-generated labels, the guidelines provided in this study can still improve the implementation, design, and qualification of the crowd-labeling as well as the label inference processes. These guidelines are supported by the results described earlier and the findings and further analysis discussed in the rest of this section.</p>
        <p>First, although the demographics of AMT workers are not available, we can still implement the crowdsourcing process in a way that accommodates a greater diversity of workers. A longitudinal labeling process, rather than one-time labeling, allows researchers to monitor the quality of the collected data over time, and mitigates the impact of spammers, irresponsible workers, and workers who are biased or mistake prone. Second, the overall quality of AMT workers can be context-sensitive and vary based on the type of labeling task. For example, the familiarity of the workers in the context of the tasks in the sleep quality data set, contrasting the broad context of physical activity and sedentary behavior concepts, resulted in higher data quality. Researchers should also be aware of the exclusion rate (eg, 5189/103,911, 4.99% in this study) and need to consider this when planning for their study’s budget and design. Third, our study results show that consensus-based inference models that do not consider the task’s features may not always be efficient for integrating crowdsourced labels and thus negatively impact the performance of ML models. Fourth, in addition to qualification requirements to filter crowdsourcing participants, sound and illustrative instruction is a less direct way to increase data quality. During the course of this project, we received nearly 70 emails from AMT workers, with most of them asking about scenarios that were mentioned in the instructions. This implies that the instruction changed their default understanding of the tasks, thereby improving the quality of the labels. Finally, when controlling the quality of workers using a qualification question, we recommend not informing the worker that this technique is being used, as they might guess the questions based on their simplicity.</p>
      </sec>
      <sec>
        <title>Key Findings</title>
        <sec>
          <title>Information Loss About Label Uncertainty</title>
          <p>Despite all the alternative models developed in this study to improve the inference accuracy, there were still considerable discrepancies between workers and the truth labels. These disagreements may be attributable to the underlying uncertainty in the data. Although reducing uncertainty by collecting more labels from more workers might simplify the process of label inference, it limits the learning ability of ML models in modeling the inherent uncertainty of data and prevents them from recovering from the mistakes made early during the inference process [<xref ref-type="bibr" rid="ref41">41</xref>].</p>
        </sec>
        <sec>
          <title>Robustness of Inference Models</title>
          <p>We observed from our inference results that, regardless of the type of the classification task, none of the 11 methods outperformed other methods across all data sets (<xref ref-type="table" rid="table3">Table 3</xref>). This indicates that inference methods are sensitive to data set characteristics. For example, the performance of all of the methods on the sleep quality data set is better than that of physical activity and sedentary behavior data sets, indicating the low robustness of these models against the task context.</p>
        </sec>
        <sec>
          <title>The Importance of Task Features</title>
          <p>Compared with supervised models that require a large volume of labeled data to integrate crowd-generated labels, using unsupervised inference models is simple and straightforward. However, this simplicity is gained through the cost of throwing away the contextual characteristics of tasks, which may sacrifice quality in context-sensitive scenarios. For example, the time that a tweet is posted during a day can contribute to the decision about its relevance to physical activity or sleep quality contexts. The importance of these characteristics was far more pronounced in the multiclass inference tasks than in the binary tasks (<xref ref-type="table" rid="table3">Table 3</xref>), suggesting the need for more complicated models when inferring the truth label of tasks with a high level of uncertainty.</p>
        </sec>
        <sec>
          <title>The Effectiveness of Qualification Requirements</title>
          <p>In this study, we used two levels of quality control: (1) through the task assignment process by accepting only workers with a master qualification and (2) through the design and implementation of the tasks by adding a qualification question to our HITs and iteratively observing workers’ performance based on their answer to this question. Our results show that even though defining these requirements improved the quality of crowd-generated labels to a great extent, 12.45% (498/4000), 13.3% (266/2000), and 7.7% (231/3000) of physical activity, sedentary behavior, and sleep quality tweets, respectively, were still mislabeled by all three workers, regardless of their context or complexity level, indicating the need for further quality assessment of crowdsourced data. These mislabeled samples were not misclassified due to sample uncertainty or difficulty, and our further analysis shows that they were not informative enough (ie, prediction scores) to improve the performance of predictive models through the iterative process of active learning (Figure S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Considering the sparsity of the (workers and tasks) matrix in large-scale crowdsourcing tasks, distinguishing irresponsible workers and removing their impact is a challenging task that should be carefully considered when training ML models based on crowd-labeled data. A sample list of low-quality labels for all the PASS categories is provided in Figure S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        </sec>
        <sec>
          <title>The Impact of Crowd-Generated Labels on the Performance of Predictive Models</title>
          <p>To further investigate the reliability of using crowdsourcing for developing ML models, we used bidirectional encoder representations from transformers [<xref ref-type="bibr" rid="ref42">42</xref>] (ie, bert-base-uncased, a transformer-based model with 12 layers, 768 hidden units, 12 heads, and 110M parameters) as a contextual input to our DL model to classify 4000 physical activity tweets, using our binary truth labels and crowd-generated labels. We used the labels inferred by SVM for the crowd-generated labels, as it outperformed other models on the physical activity data set (<xref ref-type="table" rid="table3">Table 3</xref>). Interestingly, the model that was trained on our ground truth data set outperformed the model trained on the crowd-labeled data set on all performance metrics by at least 8% (eg, crowd-labeled: AUC<sub>PR</sub> of 72%; expert-labeled: AUC<sub>PR</sub> of 82%). This indicates the importance of the quality of crowd-generated labels in developing ML models designed for decision-making purposes, such as public health surveillance decisions.</p>
        </sec>
      </sec>
      <sec>
        <title>Label Prediction Explanation</title>
        <p>To interpret the results of our predictive models in terms of the individual contribution of each feature to the prediction results, we used SHAP [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]. SHAP calculates the local, instead of global, feature importance for each sample of the data set, which mitigates the risks associated with inconsistency problems in other feature importance techniques. <xref rid="figure5" ref-type="fig">Figure 5</xref>A illustrates the interpretation of the prediction using XGBoost on a randomly selected sample of the physical activity data set using SHAP. The red arrows show the features that contribute to the increase, and the blue arrows represent features that contribute to the decrease in the prediction. The width of each arrow indicates the magnitude of its impact. From this example, we can see that <italic>l</italic><sub>1</sub>=1 and daytime=7pm have the most positive impact on the predicted label, whereas <italic>l</italic><sub>2</sub>=0 and age ≥40 have the most negative impact.</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>The estimated impact of each piece of meta-information on XGBoost when predicting the truth label. Age is in years. D&#38;S: Dawid and Skene; GLAD: generative model of labels, abilities, and difficulties; LFC: Learning from Crowds (Raykar algorithm); SHAP: Shapley additive explanations.</p>
          </caption>
          <graphic xlink:href="jmir_v24i1e28749_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>We further used Shapley values to cluster our data set based on the explanation similarity of samples, using hierarchical agglomerative clustering (<xref rid="figure5" ref-type="fig">Figure 5</xref>B). From this figure, we can see that the crowdsourced labels are the most influential features in grouping the samples in our data set. The highlighted areas in this diagram show the samples that have similar force plots, implying the dominant and similar contribution of these features across the physical activity data set.</p>
        <p>Using the additive nature of Shapley values, we integrated all the local feature values for each data point and calculated the global contribution (<italic>I</italic>) of each feature. Considering <inline-graphic xlink:href="jmir_v24i1e28749_fig11.png" xlink:type="simple" mimetype="image"/> as the Shapley value of feature <italic>j</italic> for sample <italic>i</italic>, we can calculate the global importance of this feature as <inline-graphic xlink:href="jmir_v24i1e28749_fig12.png" xlink:type="simple" mimetype="image"/>. <xref rid="figure5" ref-type="fig">Figure 5</xref>C shows the combination of feature importance (y-axis) and feature effects (colored points) for the most influential features, ordered based on their importance. This plot shows that crowdsourced labels (<italic>l<sub>1</sub>, l<sub>2</sub>,</italic> and <italic>l<sub>3</sub></italic>), followed by <italic>daytime</italic>, the results of the <italic>inference models</italic>, and <italic>gender</italic> have the greatest impact on the decision-making of XGBoost. From these results, which are extendable to the other predictive models developed in this study, it can be inferred that regardless of the complexity and the architecture of the predictive models, the crowd-generated labels are the factors that most influence predictive models’ prediction. Although meta-information such as <italic>daytime</italic> and <italic>gender</italic> are among the most contributing features (<xref rid="figure5" ref-type="fig">Figure 5</xref>C), they still cannot compete with the crowd-generated labels in most of the samples. This can explain the vulnerability of our ML and DL models to the noisy labels of the data set.</p>
        <p>To triangulate the dominant impact of the crowdsourced labels, we excluded all the samples for which <inline-graphic xlink:href="jmir_v24i1e28749_fig13.png" xlink:type="simple" mimetype="image"/> or from our data set for both supervised and unsupervised techniques and achieved an F<sub>1</sub> score of approximately 99%. This implies that inferring the truth label of crowdsourced data highly depends on the quality of the collected data from the crowd, and even advanced and complex predictive models might not be able to compensate for the poor quality of these data.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>This study has several limitations. First, the compensation paid to the workers could impact the quality of the collected labels, and consequently, the evaluation results of this study. Workers may show a higher quality in exchange for higher payments. To investigate this, during the course of the project, we increased HITs’ reward from US $0.03 to US $0.05 and did not notice any significant changes in quality. However, this is still debatable and requires further investigation.</p>
        <p>Second, to develop the supervised models, we assumed that all the tasks share the same level of complexity, whereas in reality, some examples are more difficult than others. For example, labeling “I can’t sleep” to a self-reported sleep problem is more straightforward than labeling “I’m kind of envious of anyone who is able to fall asleep before 2am.” We attempted to address this by incorporating inherent task difficulties in the prediction models by developing a hybrid CNN model. However, crowd-generated labels dominated other features of our data set, which had the greatest impact on their inference decisions. Building crowdsourcing models sensitive to the complexity of tasks to allocate more resources (workers) to more difficult tasks is a worthwhile direction for future research.</p>
        <p>Third, the way we designed and presented the HITs on AMT could impact the performance of workers in various ways. Considering the central role of people in maximizing the benefits of crowdsourcing services, human factors should be considered when designing crowdsourcing tasks [<xref ref-type="bibr" rid="ref41">41</xref>]. To address this, we added succinct, precise, and demonstrative instructions to each task and explained each label with an illustrative example (eg, Figure S6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). In addition, through different iterations of data collection, we tweaked the design, presentation, and instructions to ensure that we met the basic usability requirements of task design and presentation.</p>
        <p>Fourth, we defined workers’ qualifications based only on their historical performance in completing HITs across AMT (ie, master qualification). Although this provided some degree of quality control on the collected labels, alternative qualification requirements such as workers’ education, work background, and language could have also impacted our study results. To further study the role of qualification filtering, we pilot-tested the labeling process without any qualification requirements for 4500 physical activity tasks. These tasks were completed in &#60;12 hours with a consistency score (<italic>LC</italic>) of &#60;0.5, implying the importance of workers’ quality in developing crowd-labeled intelligent systems.</p>
        <p>Fifth, various physical activities, based on their energy requirements in metabolic equivalents (METs), can be categorized into different movement behaviors, such as light (1.6-2.9 METs), moderate (3-5.9 METs), and vigorous (≥6 METs) [<xref ref-type="bibr" rid="ref44">44</xref>]. However, as the details provided by social media data may not be enough to calculate the MET values, in this study, we only used general terms related to physical activity (eg, physical fitness, exercise, household, sports, or occupational activities) to filter and form the physical activity subset. To ensure that the lists of contextual terms for filtering all the PASS categories are comprehensive enough, in addition to domain-specific ontologies and WordNet [<xref ref-type="bibr" rid="ref45">45</xref>], we used NLP techniques (eg, topic modeling, language modeling, and lexical analysis) to detect latent word patterns that can be used to identify PASS-related contexts in unstructured text. However, with no impact on the methodology and results of this study, both data collection and population biases (inherent in social media data) should be considered when discussing the data set used for this study.</p>
        <p>Despite these limitations, our study is one of the first to rigorously investigate the challenges of using crowdsourcing to develop ML-based public health surveillance systems. Our findings support the argument that crowdsourcing, despite its low cost and short turnaround time, yields noisier data than in-house labeling. On the flip side, crowdsourcing can reduce annotation bias by involving a more diverse set of annotators [<xref ref-type="bibr" rid="ref41">41</xref>]. This diversity, supported by the diversity of AMT workers [<xref ref-type="bibr" rid="ref46">46</xref>], is highly beneficial to subjective labeling tasks, such as detecting a sedentary behavior based on a short text, which highly depends on the worker’s understanding of sedentary lifestyles.</p>
        <p>The results of this study may inspire future research to investigate and evaluate the application of crowdsourcing for the development of ML-based digital public health surveillance systems deployed and used in national surveillance decision-making. As the potential for success of ML-based digital public health surveillance relies on robust and reliable data sets, a sensitivity analysis of health-related incidents detected by ML-based surveillance models trained on crowd-generated labels versus relevant national data sets is required to ascertain this potential. Moreover, to assess whether our conclusions are sensitive to the background and expertise of participants, further investigation is required using a cohort of experts who are familiar with the public health context under study. Likewise, to untangle the effect of task context and the quality of the crowd-generated labels, replicating the approach adopted in this study using other domains, including other public health domains, remains future work. Finally, as there is a chance that the quality of the crowd-generated labels is subject to the compensation amount, confounded by the socioeconomic characteristics of the participant cohort, future investigations are required to calibrate the results of this study considering these factors.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Additional figures that describe the Amazon Mechanical Turk labeling task, predictive model performance, and incorrectly labeled tweets in more detail.</p>
        <media xlink:href="jmir_v24i1e28749_app1.pdf" xlink:title="PDF File  (Adobe PDF File), 1434 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AMT</term>
          <def>
            <p>Amazon Mechanical Turk</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">AUCPR</term>
          <def>
            <p>precision-recall area under the curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">CNN</term>
          <def>
            <p>convolutional neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">DL</term>
          <def>
            <p>deep learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">DS</term>
          <def>
            <p>Dawid and Skene</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">EM</term>
          <def>
            <p>expectation–maximization</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">GLAD</term>
          <def>
            <p>generative model of labels, abilities, and difficulties</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">HIT</term>
          <def>
            <p>Human Intelligence Task</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">KNN</term>
          <def>
            <p>K-nearest neighbors</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">LR</term>
          <def>
            <p>logistic regression</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">LSTM</term>
          <def>
            <p>long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">MET</term>
          <def>
            <p>metabolic equivalent</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">ML</term>
          <def>
            <p>machine learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">MV</term>
          <def>
            <p>majority voting</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">PASS</term>
          <def>
            <p>physical activity, sedentary behavior, and sleep quality</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb17">ReLU</term>
          <def>
            <p>Rectified Linear Unit</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb18">RY</term>
          <def>
            <p>Raykar algorithm</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb19">SHAP</term>
          <def>
            <p>Shapley Additive Explanations</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb20">SMOTE</term>
          <def>
            <p>Synthetic Minority Oversampling Technique-Nominal Continuous</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb21">SVM</term>
          <def>
            <p>support vector machine</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was supported by a postdoctoral scholarship from the Libin Cardiovascular Institute and the Cumming School of Medicine, University of Calgary. This work was also supported by a Discovery Grant from the Natural Sciences and Engineering Research Council of Canada (RGPIN-2014-04743). The Public Health Agency of Canada funded the Amazon Mechanical Turk costs. The funders of the study had no role in the study design, data collection and analysis, interpretation of results, or preparation of the manuscript.</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>ZSHA was responsible for data collection and curation, model development, data analysis, and visualization, and wrote the paper. GPB and WT reviewed the paper and provided comments. JL conceived and designed the study and revised the manuscript.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mavragani</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Infodemiology and infoveillance: scoping review</article-title>
          <source>J Med Internet Res</source>
          <year>2020</year>
          <month>04</month>
          <day>28</day>
          <volume>22</volume>
          <issue>4</issue>
          <fpage>e16206</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2020/4/e16206/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/16206</pub-id>
          <pub-id pub-id-type="medline">32310818</pub-id>
          <pub-id pub-id-type="pii">v22i4e16206</pub-id>
          <pub-id pub-id-type="pmcid">PMC7189791</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Aiello</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Renson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Zivich</surname>
              <given-names>PN</given-names>
            </name>
          </person-group>
          <article-title>Social media- and internet-based disease surveillance for public health</article-title>
          <source>Annu Rev Public Health</source>
          <year>2020</year>
          <month>04</month>
          <day>02</day>
          <volume>41</volume>
          <issue>1</issue>
          <fpage>101</fpage>
          <lpage>18</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/31905322"/>
          </comment>
          <pub-id pub-id-type="doi">10.1146/annurev-publhealth-040119-094402</pub-id>
          <pub-id pub-id-type="medline">31905322</pub-id>
          <pub-id pub-id-type="pmcid">PMC7959655</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sinnenberg</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Buttenheim</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Padrez</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Mancheno</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Ungar</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Merchant</surname>
              <given-names>RM</given-names>
            </name>
          </person-group>
          <article-title>Twitter as a Tool for Health Research: A Systematic Review</article-title>
          <source>Am J Public Health</source>
          <year>2017</year>
          <month>01</month>
          <volume>107</volume>
          <issue>1</issue>
          <fpage>e1</fpage>
          <lpage>e8</lpage>
          <pub-id pub-id-type="doi">10.2105/AJPH.2016.303512</pub-id>
          <pub-id pub-id-type="medline">27854532</pub-id>
          <pub-id pub-id-type="pmcid">PMC5308155</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bernardo</surname>
              <given-names>TM</given-names>
            </name>
            <name name-style="western">
              <surname>Rajic</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Young</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Robiadek</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Pham</surname>
              <given-names>MT</given-names>
            </name>
            <name name-style="western">
              <surname>Funk</surname>
              <given-names>JA</given-names>
            </name>
          </person-group>
          <article-title>Scoping review on search queries and social media for disease surveillance: a chronology of innovation</article-title>
          <source>J Med Internet Res</source>
          <year>2013</year>
          <month>07</month>
          <day>18</day>
          <volume>15</volume>
          <issue>7</issue>
          <fpage>e147</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2013/7/e147/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/jmir.2740</pub-id>
          <pub-id pub-id-type="medline">23896182</pub-id>
          <pub-id pub-id-type="pii">v15i7e147</pub-id>
          <pub-id pub-id-type="pmcid">PMC3785982</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hossain</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Kam</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kong</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Wigand</surname>
              <given-names>RT</given-names>
            </name>
            <name name-style="western">
              <surname>Bossomaier</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Social media in Ebola outbreak</article-title>
          <source>Epidemiol Infect</source>
          <year>2016</year>
          <month>07</month>
          <volume>144</volume>
          <issue>10</issue>
          <fpage>2136</fpage>
          <lpage>43</lpage>
          <pub-id pub-id-type="doi">10.1017/S095026881600039X</pub-id>
          <pub-id pub-id-type="medline">26939535</pub-id>
          <pub-id pub-id-type="pii">S095026881600039X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lardon</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Abdellaoui</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bellet</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Asfari</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Souvignet</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Texier</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Jaulent</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Beyens</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Burgun</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bousquet</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Adverse drug reaction identification and extraction in social media: a scoping review</article-title>
          <source>J Med Internet Res</source>
          <year>2015</year>
          <month>07</month>
          <day>10</day>
          <volume>17</volume>
          <issue>7</issue>
          <fpage>e171</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2015/7/e171/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/jmir.4304</pub-id>
          <pub-id pub-id-type="medline">26163365</pub-id>
          <pub-id pub-id-type="pii">v17i7e171</pub-id>
          <pub-id pub-id-type="pmcid">PMC4526988</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Phan</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Chun</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Geller</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Vo</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ding</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kenne</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Dou</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>An insight analysis and detection of drug-abuse risk behavior on Twitter with self-taught deep learning</article-title>
          <source>Comput Soc Netw</source>
          <year>2019</year>
          <month>11</month>
          <day>06</day>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>10</fpage>
          <pub-id pub-id-type="doi">10.1186/s40649-019-0071-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cavallo</surname>
              <given-names>DN</given-names>
            </name>
            <name name-style="western">
              <surname>Tate</surname>
              <given-names>DF</given-names>
            </name>
            <name name-style="western">
              <surname>Ries</surname>
              <given-names>AV</given-names>
            </name>
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>DeVellis</surname>
              <given-names>RF</given-names>
            </name>
            <name name-style="western">
              <surname>Ammerman</surname>
              <given-names>AS</given-names>
            </name>
          </person-group>
          <article-title>A social media-based physical activity intervention: a randomized controlled trial</article-title>
          <source>Am J Prev Med</source>
          <year>2012</year>
          <month>11</month>
          <volume>43</volume>
          <issue>5</issue>
          <fpage>527</fpage>
          <lpage>32</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/23079176"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.amepre.2012.07.019</pub-id>
          <pub-id pub-id-type="medline">23079176</pub-id>
          <pub-id pub-id-type="pii">S0749-3797(12)00520-X</pub-id>
          <pub-id pub-id-type="pmcid">PMC3479432</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dunn</surname>
              <given-names>AG</given-names>
            </name>
            <name name-style="western">
              <surname>Mandl</surname>
              <given-names>KD</given-names>
            </name>
            <name name-style="western">
              <surname>Coiera</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Social media interventions for precision public health: promises and risks</article-title>
          <source>NPJ Digit Med</source>
          <year>2018</year>
          <month>09</month>
          <day>19</day>
          <volume>1</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>4</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-018-0054-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-018-0054-0</pub-id>
          <pub-id pub-id-type="medline">30854472</pub-id>
          <pub-id pub-id-type="pmcid">PMC6402501</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Raghupathi</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Raghupathi</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Big data analytics in healthcare: promise and potential</article-title>
          <source>Health Inf Sci Syst</source>
          <year>2014</year>
          <month>02</month>
          <day>07</day>
          <volume>2</volume>
          <issue>1</issue>
          <fpage>3</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/25825667"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/2047-2501-2-3</pub-id>
          <pub-id pub-id-type="medline">25825667</pub-id>
          <pub-id pub-id-type="pii">14</pub-id>
          <pub-id pub-id-type="pmcid">PMC4341817</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shakeri Hossein Abad</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Kline</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sultana</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Noaeen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Nurmambetova</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Lucini</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Al-Jefri</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Digital public health surveillance: a systematic scoping review</article-title>
          <source>NPJ Digit Med</source>
          <year>2021</year>
          <month>03</month>
          <day>03</day>
          <volume>4</volume>
          <issue>1</issue>
          <fpage>41</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-021-00407-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-021-00407-6</pub-id>
          <pub-id pub-id-type="medline">33658681</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-021-00407-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC7930261</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Paolacci</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Chandler</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ipeirotis</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Running experiments on Amazon Mechanical Turk</article-title>
          <source>Judgm Dec Mak</source>
          <year>2010</year>
          <volume>5</volume>
          <issue>5</issue>
          <fpage>1</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://journal.sjdm.org/10/10630a/jdm10630a.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peer</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Brandimarte</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Samat</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Acquisti</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Beyond the Turk: alternative platforms for crowdsourcing behavioral research</article-title>
          <source>J Experiment Soc Psychol</source>
          <year>2017</year>
          <month>05</month>
          <volume>70</volume>
          <fpage>153</fpage>
          <lpage>63</lpage>
          <pub-id pub-id-type="doi">10.1016/j.jesp.2017.01.006</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brabham</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Ribisl</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>Kirchner</surname>
              <given-names>TR</given-names>
            </name>
            <name name-style="western">
              <surname>Bernhardt</surname>
              <given-names>JM</given-names>
            </name>
          </person-group>
          <article-title>Crowdsourcing applications for public health</article-title>
          <source>Am J Prev Med</source>
          <year>2014</year>
          <month>02</month>
          <volume>46</volume>
          <issue>2</issue>
          <fpage>179</fpage>
          <lpage>87</lpage>
          <pub-id pub-id-type="doi">10.1016/j.amepre.2013.10.016</pub-id>
          <pub-id pub-id-type="medline">24439353</pub-id>
          <pub-id pub-id-type="pii">S0749-3797(13)00589-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Marsch</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Hancock</surname>
              <given-names>JT</given-names>
            </name>
            <name name-style="western">
              <surname>Das</surname>
              <given-names>AK</given-names>
            </name>
          </person-group>
          <article-title>Scaling up research on drug abuse and addiction through social media big data</article-title>
          <source>J Med Internet Res</source>
          <year>2017</year>
          <month>10</month>
          <day>31</day>
          <volume>19</volume>
          <issue>10</issue>
          <fpage>e353</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2017/10/e353/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/jmir.6426</pub-id>
          <pub-id pub-id-type="medline">29089287</pub-id>
          <pub-id pub-id-type="pii">v19i10e353</pub-id>
          <pub-id pub-id-type="pmcid">PMC5686417</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Guttentag</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Elbel</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Kiszko</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Abrams</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Kirchner</surname>
              <given-names>TR</given-names>
            </name>
          </person-group>
          <article-title>Crowdsourcing for food purchase receipt annotation via Amazon Mechanical Turk: a feasibility study</article-title>
          <source>J Med Internet Res</source>
          <year>2019</year>
          <month>04</month>
          <day>05</day>
          <volume>21</volume>
          <issue>4</issue>
          <fpage>e12047</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2019/4/e12047/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/12047</pub-id>
          <pub-id pub-id-type="medline">30950801</pub-id>
          <pub-id pub-id-type="pii">v21i4e12047</pub-id>
          <pub-id pub-id-type="pmcid">PMC6473207</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ayers</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Leas</surname>
              <given-names>EC</given-names>
            </name>
            <name name-style="western">
              <surname>Allem</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Benton</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dredze</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Althouse</surname>
              <given-names>BM</given-names>
            </name>
            <name name-style="western">
              <surname>Cruz</surname>
              <given-names>TB</given-names>
            </name>
            <name name-style="western">
              <surname>Unger</surname>
              <given-names>JB</given-names>
            </name>
          </person-group>
          <article-title>Why do people use electronic nicotine delivery systems (electronic cigarettes)? A content analysis of Twitter, 2012-2015</article-title>
          <source>PLoS One</source>
          <year>2017</year>
          <month>03</month>
          <day>01</day>
          <volume>12</volume>
          <issue>3</issue>
          <fpage>e0170702</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0170702"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0170702</pub-id>
          <pub-id pub-id-type="medline">28248987</pub-id>
          <pub-id pub-id-type="pii">PONE-D-16-28731</pub-id>
          <pub-id pub-id-type="pmcid">PMC5331961</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yin</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Fabbri</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Rosenbloom</surname>
              <given-names>ST</given-names>
            </name>
            <name name-style="western">
              <surname>Malin</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>A scalable framework to detect personal health mentions on Twitter</article-title>
          <source>J Med Internet Res</source>
          <year>2015</year>
          <month>06</month>
          <day>05</day>
          <volume>17</volume>
          <issue>6</issue>
          <fpage>e138</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2015/6/e138/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/jmir.4305</pub-id>
          <pub-id pub-id-type="medline">26048075</pub-id>
          <pub-id pub-id-type="pii">v17i6e138</pub-id>
          <pub-id pub-id-type="pmcid">PMC4526910</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McIver</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Hawkins</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Chunara</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Chatterjee</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Bhandari</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fitzgerald</surname>
              <given-names>TP</given-names>
            </name>
            <name name-style="western">
              <surname>Jain</surname>
              <given-names>SH</given-names>
            </name>
            <name name-style="western">
              <surname>Brownstein</surname>
              <given-names>JS</given-names>
            </name>
          </person-group>
          <article-title>Characterizing sleep issues using Twitter</article-title>
          <source>J Med Internet Res</source>
          <year>2015</year>
          <month>06</month>
          <day>08</day>
          <volume>17</volume>
          <issue>6</issue>
          <fpage>e140</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2015/6/e140/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/jmir.4476</pub-id>
          <pub-id pub-id-type="medline">26054530</pub-id>
          <pub-id pub-id-type="pii">v17i6e140</pub-id>
          <pub-id pub-id-type="pmcid">PMC4526927</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reece</surname>
              <given-names>AG</given-names>
            </name>
            <name name-style="western">
              <surname>Reagan</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Lix</surname>
              <given-names>KL</given-names>
            </name>
            <name name-style="western">
              <surname>Dodds</surname>
              <given-names>PS</given-names>
            </name>
            <name name-style="western">
              <surname>Danforth</surname>
              <given-names>CM</given-names>
            </name>
            <name name-style="western">
              <surname>Langer</surname>
              <given-names>EJ</given-names>
            </name>
          </person-group>
          <article-title>Forecasting the onset and course of mental illness with Twitter data</article-title>
          <source>Sci Rep</source>
          <year>2017</year>
          <month>10</month>
          <day>11</day>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>13006</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-017-12961-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-017-12961-9</pub-id>
          <pub-id pub-id-type="medline">29021528</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-017-12961-9</pub-id>
          <pub-id pub-id-type="pmcid">PMC5636873</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Adrover</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Bodnar</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Telenti</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Salathé</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Identifying adverse effects of HIV drug treatment and associated sentiments using Twitter</article-title>
          <source>JMIR Public Health Surveill</source>
          <year>2015</year>
          <month>07</month>
          <day>27</day>
          <volume>1</volume>
          <issue>2</issue>
          <fpage>e7</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://publichealth.jmir.org/2015/2/e7/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/publichealth.4488</pub-id>
          <pub-id pub-id-type="medline">27227141</pub-id>
          <pub-id pub-id-type="pii">v1i2e7</pub-id>
          <pub-id pub-id-type="pmcid">PMC4869211</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peer</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Vosgerau</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Acquisti</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Reputation as a sufficient condition for data quality on Amazon Mechanical Turk</article-title>
          <source>Behav Res Methods</source>
          <year>2013</year>
          <month>12</month>
          <day>20</day>
          <volume>46</volume>
          <issue>4</issue>
          <fpage>1023</fpage>
          <lpage>31</lpage>
          <pub-id pub-id-type="doi">10.3758/s13428-013-0434-y</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lundberg</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>A unified approach to interpreting model predictions</article-title>
          <source>Proceedings of the 31st Conference on Neural Information Processing Systems (NIPS 2017)</source>
          <year>2017</year>
          <conf-name>31st Conference on Neural Information Processing Systems (NIPS 2017)</conf-name>
          <conf-date>2017</conf-date>
          <conf-loc>Long Beach, CA, USA</conf-loc>
          <fpage>4765</fpage>
          <lpage>74</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://proceedings.neurips.cc/paper/2017/file/8a20a8621978632d76c43dfd28b67767-Paper.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Hale</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Adelani</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Grabowicz</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Hartman</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Flöck</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Jurgens</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Demographic inference and representative population estimates from multilingual social media data</article-title>
          <source>Proceedings of the World Wide Web Conference</source>
          <year>2019</year>
          <conf-name>WWW '19: The Web Conference</conf-name>
          <conf-date>May 13-17, 2019</conf-date>
          <conf-loc>San Francisco, CA, USA</conf-loc>
          <fpage>2056</fpage>
          <lpage>67</lpage>
          <pub-id pub-id-type="doi">10.1145/3308558.3313684</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shakeri Hossein Abad</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Butler</surname>
              <given-names>GP</given-names>
            </name>
            <name name-style="western">
              <surname>Thompson</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Physical activity, sedentary behaviour, and sleep on Twitter: a multicountry and fully labelled dataset for public health surveillance research</article-title>
          <source>JMIR Preprints.</source>
          <comment>Preprint posted online July 23, 2021
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://preprints.jmir.org/preprint/32355/accepted"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/32355</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Shan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Truth inference in crowdsourcing: is the problem solved?</article-title>
          <source>Proc VLDB Endow</source>
          <year>2017</year>
          <month>01</month>
          <volume>10</volume>
          <issue>5</issue>
          <fpage>541</fpage>
          <lpage>52</lpage>
          <pub-id pub-id-type="doi">10.14778/3055540.3055547</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sheshadri</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lease</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>SQUARE: a benchmark for research on computing crowd consensus</article-title>
          <source>Proceedings of the First AAAI Conference on Human Computation and Crowdsourcing</source>
          <year>2013</year>
          <conf-name>First AAAI Conference on Human Computation and Crowdsourcing</conf-name>
          <conf-date>November 7-9, 2013</conf-date>
          <conf-loc>Palm Springs, California, USA</conf-loc>
          <fpage>156</fpage>
          <lpage>64</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ojs.aaai.org/index.php/HCOMP/article/view/13088"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dawid</surname>
              <given-names>AP</given-names>
            </name>
            <name name-style="western">
              <surname>Skene</surname>
              <given-names>AM</given-names>
            </name>
          </person-group>
          <article-title>Maximum likelihood estimation of observer error-rates using the EM algorithm</article-title>
          <source>Appl Stat</source>
          <year>1979</year>
          <volume>28</volume>
          <issue>1</issue>
          <fpage>20</fpage>
          <pub-id pub-id-type="doi">10.2307/2346806</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ipeirotis</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Provost</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Quality management on Amazon Mechanical Turk</article-title>
          <source>Proceedings of the ACM SIGKDD Workshop on Human Computation</source>
          <year>2010</year>
          <conf-name>KDD '10: The 16th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name>
          <conf-date>July 25, 2010</conf-date>
          <conf-loc>Washington, DC</conf-loc>
          <fpage>64</fpage>
          <lpage>7</lpage>
          <pub-id pub-id-type="doi">10.1145/1837885.1837906</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Whitehill</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Bergsma</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Movellan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ruvolo</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Whose vote should count more: optimal integration of labels from labelers of unknown expertise</article-title>
          <source>Proceedings of the 23rd Annual Conference on Neural Information Processing Systems</source>
          <year>2009</year>
          <conf-name>23rd Annual Conference on Neural Information Processing Systems</conf-name>
          <conf-date>December 7-10, 2009</conf-date>
          <conf-loc>Vancouver, British Columbia, Canada</conf-loc>
          <fpage>2035</fpage>
          <lpage>43</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://papers.nips.cc/paper/2009/hash/f899139df5e1059396431415e770c6dd-Abstract.html"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Raykar</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Valadez</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Florin</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Bogoni</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Moy</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Learning from crowds</article-title>
          <source>J Mach Learn Res</source>
          <year>2010</year>
          <volume>11</volume>
          <issue>43</issue>
          <fpage>1297</fpage>
          <lpage>322</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://jmlr.csail.mit.edu/papers/v11/raykar10a.html"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pennington</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Socher</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>GloVe: global vectors for word representation</article-title>
          <source>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)</source>
          <year>2014</year>
          <conf-name>Conference on Empirical Methods in Natural Language Processing (EMNLP)</conf-name>
          <conf-date>October 2014</conf-date>
          <conf-loc>Doha, Qatar</conf-loc>
          <fpage>1532</fpage>
          <lpage>43</lpage>
          <pub-id pub-id-type="doi">10.3115/v1/d14-1162</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chawla</surname>
              <given-names>NV</given-names>
            </name>
            <name name-style="western">
              <surname>Bowyer</surname>
              <given-names>KW</given-names>
            </name>
            <name name-style="western">
              <surname>Hall</surname>
              <given-names>LO</given-names>
            </name>
            <name name-style="western">
              <surname>Kegelmeyer</surname>
              <given-names>WP</given-names>
            </name>
          </person-group>
          <article-title>SMOTE: Synthetic Minority Over-sampling Technique</article-title>
          <source>J Artif Intell Res</source>
          <year>2002</year>
          <month>06</month>
          <day>01</day>
          <volume>16</volume>
          <fpage>321</fpage>
          <lpage>57</lpage>
          <pub-id pub-id-type="doi">10.1613/jair.953</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Snoek</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Larochelle</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Adams</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Practical Bayesian optimization of machine learning algorithms</article-title>
          <source>Proceedings of the Advances in Neural Information Processing Systems 25 (NIPS 2012)</source>
          <year>2012</year>
          <conf-name>Advances in Neural Information Processing Systems 25 (NIPS 2012)</conf-name>
          <conf-date>December 3-6, 2012</conf-date>
          <conf-loc>Lake Tahoe, Nevada, USA</conf-loc>
          <fpage>2951</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://proceedings.neurips.cc/paper/2012/hash/05311655a15b75fab86956663e1819cd-Abstract.html"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abadi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Barham</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Davis</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Devin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ghemawat</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Irving</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Isard</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kudlur</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Levenberg</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Monga</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Moore</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Murray</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Steiner</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Tucker</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Vasudevan</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Warden</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Wicke</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>TensorFlow: a system for large-scale machine learning</article-title>
          <source>Proceedings of the 12th USENIX Conference on Operating Systems Design and Implementation</source>
          <year>2016</year>
          <conf-name>12th USENIX Conference on Operating Systems Design and Implementation</conf-name>
          <conf-date>November 2-4, 2016</conf-date>
          <conf-loc>Savannah, GA, USA</conf-loc>
          <fpage>265</fpage>
          <lpage>83</lpage>
          <pub-id pub-id-type="doi">10.5555/3026877.3026899</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chollet</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Keras: the Python deep learning library</article-title>
          <source>Astrophysics Source Code Library</source>
          <year>2018</year>
          <access-date>2021-12-11</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ui.adsabs.harvard.edu/abs/2018ascl.soft06022C/abstract">https://ui.adsabs.harvard.edu/abs/2018ascl.soft06022C/abstract</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pedregosa</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Varoquaux</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Gramfort</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Michel</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Thirion</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Grisel</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Blondel</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Prettenhofer</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Weiss</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Dubourg</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Vanderplas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Passos</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cournapeau</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Brucher</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Perrot</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Duchesnay</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Scikit-learn: machine learning in Python</article-title>
          <source>J Mach Learn Res</source>
          <year>2011</year>
          <volume>12</volume>
          <issue>10</issue>
          <fpage>2825</fpage>
          <lpage>30</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmlr.org/papers/v12/pedregosa11a.html"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="web">
          <article-title>CrowdSourcing-for-Digital-Public-Health-Surveillance</article-title>
          <source>GitHub</source>
          <access-date>2021-12-16</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/data-intelligence-for-health-lab/CrowdSourcing-for-Digital-Public-Health-Surveillance">https://github.com/data-intelligence-for-health-lab/CrowdSourcing-for-Digital-Public-Health-Surveillance</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lewis</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Gale</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <source>A Sequential Algorithm for Training Text Classifiers</source>
          <year>1994</year>
          <publisher-loc>London, UK</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>3</fpage>
          <lpage>12</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Laws</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Scheible</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Schütze</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Active learning with Amazon Mechanical Turk</article-title>
          <source>Proceedings of the 2011 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2011</year>
          <conf-name>Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>July 2011</conf-date>
          <conf-loc>Edinburgh, Scotland, UK</conf-loc>
          <fpage>1546</fpage>
          <lpage>56</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/D11-1143/"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lease</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>On quality control and machine learning in crowdsourcing</article-title>
          <source>Hum Comput</source>
          <year>2011</year>
          <volume>11</volume>
          <issue>11</issue>
          <fpage>1</fpage>
          <lpage>6</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.670.5262&#38;rep=rep1&#38;type=pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title>
          <source>arXiv</source>
          <year>2018</year>
          <access-date>2021-12-11</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1810.04805">https://arxiv.org/abs/1810.04805</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lundberg</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Erion</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>DeGrave</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Prutkin</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Nair</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Katz</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Himmelfarb</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bansal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>From local explanations to global understanding with explainable AI for trees</article-title>
          <source>Nat Mach Intell</source>
          <year>2020</year>
          <month>01</month>
          <day>17</day>
          <volume>2</volume>
          <issue>1</issue>
          <fpage>56</fpage>
          <lpage>67</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/32607472"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s42256-019-0138-9</pub-id>
          <pub-id pub-id-type="medline">32607472</pub-id>
          <pub-id pub-id-type="pmcid">PMC7326367</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Caspersen</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Powell</surname>
              <given-names>KE</given-names>
            </name>
            <name name-style="western">
              <surname>Christenson</surname>
              <given-names>GM</given-names>
            </name>
          </person-group>
          <article-title>Physical activity, exercise, and physical fitness: definitions and distinctions for health-related research</article-title>
          <source>Public Health Rep</source>
          <year>1985</year>
          <volume>100</volume>
          <issue>2</issue>
          <fpage>126</fpage>
          <lpage>31</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/3920711"/>
          </comment>
          <pub-id pub-id-type="medline">3920711</pub-id>
          <pub-id pub-id-type="pmcid">PMC1424733</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>GA</given-names>
            </name>
          </person-group>
          <article-title>WordNet: a lexical database for English</article-title>
          <source>Commun ACM</source>
          <year>1995</year>
          <month>11</month>
          <volume>38</volume>
          <issue>11</issue>
          <fpage>39</fpage>
          <lpage>41</lpage>
          <pub-id pub-id-type="doi">10.1145/219717.219748</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Difallah</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Filatova</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Ipeirotis</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Demographics and dynamics of Mechanical Turk workers</article-title>
          <source>Proceedings of the Eleventh ACM International Conference on Web Search and Data Mining</source>
          <year>2018</year>
          <conf-name>WSDM 2018: The Eleventh ACM International Conference on Web Search and Data Mining</conf-name>
          <conf-date>February 5 - 9, 2018</conf-date>
          <conf-loc>Marina Del Rey, CA, USA</conf-loc>
          <fpage>135</fpage>
          <lpage>43</lpage>
          <pub-id pub-id-type="doi">10.1145/3159652.3159661</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
