<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v22i7e14337</article-id>
      <article-id pub-id-type="pmid">32437327</article-id>
      <article-id pub-id-type="doi">10.2196/14337</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Surveilling Influenza Incidence With Centers for Disease Control and Prevention Web Traffic Data: Demonstration Using a Novel Dataset</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Biggerstaff</surname>
            <given-names>Matthew</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Murray</surname>
            <given-names>Erin</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Lau</surname>
            <given-names>Eric</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Caldwell</surname>
            <given-names>Wendy K</given-names>
          </name>
          <degrees>BS, MA, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>X Computational Physics Division</institution>
            <institution>Los Alamos National Laboratory</institution>
            <addr-line>P.O. Box 1663</addr-line>
            <addr-line>Mail Stop T086</addr-line>
            <addr-line>Los Alamos, NM, 87545</addr-line>
            <country>United States</country>
            <phone>1 5056678593</phone>
            <email>wkcaldwell@lanl.gov</email>
          </address>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6076-5636</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Fairchild</surname>
            <given-names>Geoffrey</given-names>
          </name>
          <degrees>BS, MS, PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5500-8120</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Del Valle</surname>
            <given-names>Sara Y</given-names>
          </name>
          <degrees>BS, MS, PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-0159-1952</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>X Computational Physics Division</institution>
        <institution>Los Alamos National Laboratory</institution>
        <addr-line>Los Alamos, NM</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>School of Mathematical and Statistical Sciences</institution>
        <institution>Arizona State University</institution>
        <addr-line>Tempe, AZ</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Analytics, Intelligence, and Technology Division</institution>
        <institution>Los Alamos National Laboratory</institution>
        <addr-line>Los Alamos, NM</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Wendy K Caldwell <email>wkcaldwell@lanl.gov</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>7</month>
        <year>2020</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>3</day>
        <month>7</month>
        <year>2020</year>
      </pub-date>
      <volume>22</volume>
      <issue>7</issue>
      <elocation-id>e14337</elocation-id>
      <history>
        <date date-type="received">
          <day>10</day>
          <month>4</month>
          <year>2019</year>
        </date>
        <date date-type="rev-request">
          <day>10</day>
          <month>9</month>
          <year>2019</year>
        </date>
        <date date-type="rev-recd">
          <day>29</day>
          <month>1</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>22</day>
          <month>3</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©Wendy K Caldwell, Geoffrey Fairchild, Sara Y Del Valle. Originally published in the Journal of Medical Internet Research (http://www.jmir.org), 03.07.2020.</copyright-statement>
      <copyright-year>2020</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on http://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2020/7/e14337" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Influenza epidemics result in a public health and economic burden worldwide. Traditional surveillance techniques, which rely on doctor visits, provide data with a delay of 1 to 2 weeks. A means of obtaining real-time data and forecasting future outbreaks is desirable to provide more timely responses to influenza epidemics.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to present the first implementation of a novel dataset by demonstrating its ability to supplement traditional disease surveillance at multiple spatial resolutions.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We used internet traffic data from the Centers for Disease Control and Prevention (CDC) website to determine the potential usability of this data source. We tested the traffic generated by 10 influenza-related pages in 8 states and 9 census divisions within the United States and compared it against clinical surveillance data.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Our results yielded an <italic>r</italic><sup>2</sup> value of 0.955 in the most successful case, promising results for some cases, and unsuccessful results for other cases. In the interest of scientific transparency to further the understanding of when internet data streams are an appropriate supplemental data source, we also included negative results (ie, unsuccessful models). Models that focused on a single influenza season were more successful than those that attempted to model multiple influenza seasons. Geographic resolution appeared to play a key role, with national and regional models being more successful, overall, than models at the state level.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>These results demonstrate that internet data may be able to complement traditional influenza surveillance in some cases but not in others. Specifically, our results show that the CDC website traffic may inform national- and division-level models but not models for each individual state. In addition, our results show better agreement when the data were broken up by seasons instead of aggregated over several years. We anticipate that this work will lead to more complex nowcasting and forecasting models using this data stream.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>influenza</kwd>
        <kwd>surveillance</kwd>
        <kwd>infoveillance</kwd>
        <kwd>infodemiology</kwd>
        <kwd>projections and predictions</kwd>
        <kwd>internet</kwd>
        <kwd>data sources</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background and Motivation</title>
        <p>Every year, an estimated 5% to 20% of people in the United States become infected with influenza [<xref ref-type="bibr" rid="ref1">1</xref>]. The typical influenza season begins in October and ends in May, with the peak occurring in the winter months. Annually, 3000 to 50,000 people die from the flu, with another 200,000 people requiring hospitalization [<xref ref-type="bibr" rid="ref2">2</xref>]. The yearly flu burden is estimated to cost around US $11.2 billion in lost productivity, with some estimates as high as US $87 billion [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>]. Timely surveillance of influenza can help reduce this burden, allowing health care facilities to more adequately prepare for the influx of patients when flu levels are high [<xref ref-type="bibr" rid="ref4">4</xref>].</p>
        <p>One common surveillance measure is the fraction of patients presenting with influenza-like illness (ILI), consisting of a fever of at least 100°F (37.8°C) and a cough or sore throat with no other known cause [<xref ref-type="bibr" rid="ref5">5</xref>]. ILI data are collected from about 2900 volunteer health care providers throughout the United States, although each week, only about 1800 of them report their data. These data are then aggregated and made public after a time lag of about 1 to 2 weeks [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref10">10</xref>]. As the ILI data are collected from volunteer health care providers, the dataset is incomplete. If policies were enacted to provide incentives for health care providers who report these data or to make reporting compulsory, the result would be a more complete dataset. Other surveillance systems include virological data from the World Health Organization, emergency department visits, electronic health records, crowd-sourced ILI reports, Widely Internet Sourced Distributed Monitoring, Influenzanet, and Flu Near You [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>].</p>
      </sec>
      <sec>
        <title>Internet Data Streams</title>
        <p>In the United States, 87% [<xref ref-type="bibr" rid="ref13">13</xref>] of adults use the internet. Of those internet users, 72% [<xref ref-type="bibr" rid="ref13">13</xref>] have used the internet to search for health information within the last year. The most common health-related searches are for information regarding a specific disease or condition (66%) and information about a specific treatment or procedure (56%) [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>].</p>
        <p>There are two main types of health-related internet activity. The first is health sharing, in which internet users post about health-related topics (eg, a tweet about being sick). The second is health seeking, in which users use the internet to obtain information about health-related topics [<xref ref-type="bibr" rid="ref6">6</xref>]. In this paper, we focused on health-seeking behavior. Previous studies have shown that analyzing web-based health-seeking behavior can improve early detection of disease incidence by detecting changes in disease activity [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref19">19</xref>]. Similarly, other studies have shown that internet data emerging from search queries can aid detection of outbreaks in areas with large populations of internet users [<xref ref-type="bibr" rid="ref20">20</xref>] because web-based health-related search queries and epidemics are often strongly correlated [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>].</p>
        <p>Internet data have been used to forecast disease incidence in other models. Polgreen et al [<xref ref-type="bibr" rid="ref9">9</xref>] developed linear influenza forecasting models with lags of 1 to 10 weeks for each of the 9 US census regions using search queries from Yahoo [<xref ref-type="bibr" rid="ref9">9</xref>]. The best performing models had lags of 1 to 3 weeks and an average <italic>r<sup>2</sup></italic> value of 0.38 (with a high of 0.57 in the East South Central region) [<xref ref-type="bibr" rid="ref9">9</xref>]. These low <italic>r<sup>2</sup></italic> values demonstrate potential problems in relying on search information alone. Ginsberg et al [<xref ref-type="bibr" rid="ref15">15</xref>] were able to predict influenza epidemics 2 weeks in advance using Google search queries to fit linear models using log-odds of ILI visits and related searches.</p>
        <p>Using a Poisson distribution and Lasso regression, McIver and Brownstein [<xref ref-type="bibr" rid="ref8">8</xref>] obtained an <italic>r<sup>2</sup></italic> value of 0.946 using Wikipedia data, although some data were excluded from analyses because of increased media attention and higher than normal influenza activity. Generous et al [<xref ref-type="bibr" rid="ref7">7</xref>] used Wikipedia data to train a statistical model with linear regression, which demonstrated its potential for forecasting disease incidence worldwide, including influenza in the United States, which had an <italic>r<sup>2</sup></italic> value of 0.89. Hickmann et al [<xref ref-type="bibr" rid="ref1">1</xref>] conducted a similar study of linear regression models, which showed that using Wikipedia to forecast influenza in the United States for the 2013 to 2014 season resulted in an <italic>r<sup>2</sup></italic> value greater than 0.9 in some instances.</p>
        <p>Integrating both Wikipedia data and Google Flu Trends, Bardak et al [<xref ref-type="bibr" rid="ref22">22</xref>] obtained <italic>r<sup>2</sup></italic> values of 0.94 and 0.91 using ordinary least squares (OLS) and ridge regression, respectively, for forecasting influenza outbreaks. For OLS nowcasting, the <italic>r<sup>2</sup></italic> value was 0.98 in the best case. For the best fit, the weekly data were offset by 1 week, so that ILI visits were correlated with internet data from the prior week [<xref ref-type="bibr" rid="ref22">22</xref>].</p>
        <p>As part of the Centers for Disease Control and Prevention (CDC)’s 2013 to 2014 Predict the Influenza Season Challenge, 9 teams used digital data sources to create forecasting models. The digital sources these teams used were Wikipedia, Twitter, Google Flu Trends, and HealthMap. The teams used either mechanistic or statistical models to create their forecasts, with the most successful team using multiple data sources, which may have reduced biases usually associated with internet data streams [<xref ref-type="bibr" rid="ref23">23</xref>]. Broniatowski et al [<xref ref-type="bibr" rid="ref24">24</xref>] used Twitter data to detect increasing and decreasing influenza prevalence with 85% accuracy. Zhang et al [<xref ref-type="bibr" rid="ref25">25</xref>] used Twitter data to inform stochastic, spatially structured mechanistic models of influenza in the United States, Italy, and Spain.</p>
        <p>Internet data streams have also been used to supplement traditional surveillance techniques with nowcasting models. Paul et al [<xref ref-type="bibr" rid="ref26">26</xref>] used Twitter along with ILI data from the CDC to produce nowcasting influenza models as well as nowcasting models using solely ILI data. They concluded that the addition of Twitter data led to more accurate nowcasting models. Santillana et al [<xref ref-type="bibr" rid="ref27">27</xref>] combined Google Trends data and CDC-reported ILI data to create models for nowcasting and forecasting influenza. Lampos et al [<xref ref-type="bibr" rid="ref28">28</xref>] used search query data to explore both linear and nonlinear nowcasting models. Yang et al [<xref ref-type="bibr" rid="ref29">29</xref>] used Google search data to create an influenza tracking model with autoregression.</p>
        <p>In contrast, we considered data on page views of the CDC website rather than search data from sites not solely devoted to public health. We used this dataset because we expected it to be inherently less noisy because of its focus on public health issues. We used OLS to nowcast influenza nationally, across the 9 US census divisions, and across 8 states using access data from 10 influenza-related CDC pages. Our nowcasting models cover influenza seasons from 2013 to 2016, with the 2012 to 2013 season being partially included because the CDC page view dataset begins on January 1, 2013. The inclusion of an incomplete influenza season serves to show whether this dataset can be used given a more restrictive time frame. We included both positive and negative results to advance our knowledge regarding when internet data may or may not work. The negative results are crucial to advancing the field of disease surveillance using internet data, as they demonstrate when these data sources contribute to unreliable surveillance. We focus on answering the following two research questions: (1) Can CDC page visits be used as an additional data source for monitoring disease incidence? and (2) What is the appropriate time shift of the page view data needed to obtain the best data fit?</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Sources</title>
        <p>We used page view data provided by the CDC. Each data point contains the page name, date and time of access, and the geographic location from where the page was viewed. These data are available at geographic resolutions of national and state levels and include some metropolitan areas (eg, New York City). The data are available at a number of temporal resolutions beginning on January 1, 2013. For these models, we used weekly page view data to coincide with the ILI data temporal resolution. The data are available as raw page view counts and page view counts normalized with respect to all CDC page views, and we considered the latter for this work. We selected pages associated with general influenza information, treatment, and diagnosis. Pages were sometimes renamed, but we were able to follow the evolution of each selected page by using keywords in the page titles as well as the date ranges for available data.</p>
        <p>As the majority of health-related internet searches concern specific conditions, treatments, and procedures [<xref ref-type="bibr" rid="ref14">14</xref>], we selected pages related to those topics. These pages also align with the study by Johnson et al [<xref ref-type="bibr" rid="ref30">30</xref>], who used pages in the categories of diagnosis and treatment as well as prevention and vaccination for influenza surveillance [<xref ref-type="bibr" rid="ref30">30</xref>]. Specifically, we used the following pages: antivirals, flu basics, FluView, high risk complications, key facts, prevention, symptoms, treating influenza, treatment, and vaccine. We then aggregated the page views of interest for each of our models. FluView has the potential to be an outlier page, especially when used alone, as this page tracks the severity of the influenza season and could have higher page views as a result of media attention and severe influenza seasons. However, when combined with other pages focused on treatment and prevention, we expected these page view data to be useful for our models. A complete list of pages can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <p>The states we selected were based on the severity of flu (determined from FluView) during the available seasons and the availability of ILI data at the time of the study, which is not standardized and is dependent on each state’s reporting mechanism. ILI data for each state include the week ending or starting date and the percentage of ILI for the specified week. Although some states also report additional data, such as school closures and hospitalizations, these data are not made available by every state. Note that ILI reporting and accessibility vary across all states. The states we selected were (1) California, (2) Maine, (3) Missouri, (4) New Jersey, (5) New Mexico, (6) North Carolina, (7) Texas, and (8) Wisconsin. With the exception of Texas, these states did not release ILI data outside of the typical flu season. As the purpose of this study was to demonstrate the viability of nowcasting, we considered only those ILI data available during the study period. Although some states have made their ILI more accessible since the end of the study, we did not consider these data, as they were not available during the study period. The exclusion of additional data not available during the study period helps to preserve the premise of nowcasting by focusing only on data sources available during the study period. Likewise, our state ILI data often came from the state’s individual weekly reports during the seasons used in the study. A complete list of the data sources for the state ILI can be found in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>, and the clinical data are available in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendices 3</xref> and <xref ref-type="supplementary-material" rid="app4">4</xref>.</p>
        <p><xref rid="figure1" ref-type="fig">Figure 1</xref> shows the percentage of ILI visits for each state considered in this study and the national percentage of ILI visits. We see distinct spikes that indicate the peaks of the flu seasons. With the exception of Maine, which behaves as an outlier at times, the figure shows spikes indicating there are peak weeks for influenza-related page views. Texas also exhibits outlier behavior with ILI percentages consistently higher than the typical national baseline of 2%, which is used to determine when the flu has reached epidemic status. These 2 outliers are shown in teal (Texas) and dark blue (Maine). The national ILI is shown in black. The remaining states exhibit behavior consistent with the national ILI trend. <xref rid="figure2" ref-type="fig">Figure 2</xref> shows the CDC page view data as a heat map: weeks with more page views are shown darker than weeks with fewer page views.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Percentage of ILI visits per state compared with the typical national baseline of 2%. Maine (dark blue) and Texas (teal) exhibit outlier behavior, with Texas having a greater ILI percentage and Maine having a lesser ILI percentage. The remaining states follow the national ILI trend, shown in black. ILI: influenza-like illness.</p>
          </caption>
          <graphic xlink:href="jmir_v22i7e14337_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Normalized CDC web traffic as a heat map. Darker areas indicate more page views and appear to correlate with increases in influenza-like illness. The page views also appear to be more prevalent during the typical influenza season, October to May. CDC: Centers for Disease Control and Prevention.</p>
          </caption>
          <graphic xlink:href="jmir_v22i7e14337_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>In addition to selected states, we also considered the 9 US census divisions: New England, Middle Atlantic, East North Central, West North Central, South Atlantic, East South Central, West South Central, Mountain, and Pacific. <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref> provides a list of states included in each division. Data for the census divisions were obtained from the CDC and are presented in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p>
      </sec>
      <sec>
        <title>Linear Regression</title>
        <p>We used statsmodels version 0.9.0 [<xref ref-type="bibr" rid="ref31">31</xref>], a statistical analysis module for Python, to perform linear regression on our datasets using OLS. This creates a linear model <italic>M</italic>, the summation of regression coefficients multiplied by page view data. <xref rid="figure3" ref-type="fig">Figure 3</xref> shows the mathematical formula of <italic>M</italic>, where <inline-graphic xlink:href="jmir_v22i7e14337_fig7.png" xlink:type="simple" mimetype="image"/> are the regression coefficients and <italic>X</italic>=1, <italic>X</italic><sub>1</sub>, ... <italic>X</italic><sub>n</sub> is the vector of CDC page view data, with <italic>n</italic> representing the number of CDC pages used for the model, ranging from 1 to 10. <italic>M</italic> is a value between 0 and 1, representing the fraction of ILI visits. To plot the models and data on the same axes, we normalized <italic>M</italic> for visualization purposes, with <italic>M=1</italic> corresponding to the ILI percentage during the peak week of the influenza season. We correlated ILI and CDC page views for the same week or with a 1-week shift. In the shifted cases, we shifted the ILI data forward by 1 week, so that the model associates the current week’s page views with the following week’s ILI data. This shift is performed to account for the incubation period of influenza and the time between the onset of symptoms and the first doctor visit. Statsmodels [<xref ref-type="bibr" rid="ref31">31</xref>] uses the CDC page view and ILI data to determine the appropriate regression coefficients; fits parameters with OLS; and computes the goodness-of-fit, <italic>r</italic><sup>2</sup>, also referred to as the coefficient of determination. The <italic>r</italic><sup>2</sup> value measures how well 2 time series correlate. An <italic>r</italic><sup>2</sup> value of 1 indicates a perfect fit, whereas an <italic>r</italic><sup>2</sup> value of 0 indicates no correlation. 
Although <italic>r</italic><sup>2</sup> is not necessarily the best metric to judge goodness-of-fit [<xref ref-type="bibr" rid="ref6">6</xref>], it is nonetheless the most common metric used and still provides one with a decent overall sense of fit quality. In addition, we examined the root mean square error (RMSE) and the normalized root mean square error (NRMSE) using Python scikit-learn libraries. The RMSE and NRMSE metrics measure how the model prediction differs from the actual data, with the NRMSE normalized so that the greatest possible value is 1. For these metrics, lower numbers indicate a better fit.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Mathematical formula of the linear ILI models created in this study. The model <italic>M</italic> represents the fraction of ILI visits, where <inline-graphic xlink:href="jmir_v22i7e14337_fig7.png" xlink:type="simple" mimetype="image"/> are the regression coefficients and <italic>X</italic>=1, <italic>X</italic><sub>1</sub>, ... <italic>X</italic><sub>n</sub> is the vector of CDC page view data, with <italic>n</italic> representing the number of CDC pages used for the model, ranging from 1 to 10. ILI: influenza-like illness; CDC: Centers for Disease Control and Prevention.</p>
          </caption>
          <graphic xlink:href="jmir_v22i7e14337_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Format of Results</title>
        <p>We analyzed the data at the national, division, and state levels and computed <italic>r</italic><sup>2</sup> for each geographic resolution. In this section, we discuss the results of our experiments, both successes and failures. We include figures of models at the national, census division, and state levels. Owing to the varying scales between page views and ILI percent, we chose to normalize the data and our models to plot them on the same axes. Raw data were used to create the models and then each model was normalized with respect to its maximum. We also normalized the ILI data and CDC web traffic data with respect to their maximums for the given period so that all 3 curves may appear in the same plot. Additional model successes and failures not discussed here can be found in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>.</p>
      </sec>
      <sec>
        <title>National Results</title>
        <p>We selected pages that corresponded to the topics most often searched during web-based health-seeking activities. Aggregating all 10 pages in a single model, we were able to achieve an <italic>r<sup>2</sup></italic> value of 0.889 for the national 2012 to 2013 influenza season after implementing a 1-week shift. We also succeeded in modeling the national 2015 to 2016 influenza season with no shift, achieving an <italic>r<sup>2</sup></italic> value of 0.834. We obtained better results when limiting the pages to FluView, Symptoms, and Treatment, which we attribute to the information on these pages aligning with topics most commonly used for internet health seeking. For these pages, the most successful models did not have a shift. For the 2012 to 2013 influenza season, we achieved an <italic>r<sup>2</sup></italic> value of 0.906. The model for the 2015 to 2016 season had an <italic>r<sup>2</sup></italic> value of 0.891. <xref ref-type="table" rid="table1">Table 1</xref> shows the most successful model for each influenza season included in this study. <xref rid="figure4" ref-type="fig">Figure 4</xref> shows these models; the figure caption indicates which page(s) make up the CDC web traffic curve in each panel, which is also the data used to fit the corresponding model.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Pages and shifts for the most successful models for each influenza season at the national level.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="280"/>
            <col width="110"/>
            <col width="70"/>
            <col width="70"/>
            <col width="190"/>
            <col width="280"/>
            <thead>
              <tr valign="bottom">
                <td>Pages used in model</td>
                <td>Season</td>
                <td>Shift</td>
                <td>
                  <italic>r<sup>2</sup></italic>
                </td>
                <td>Root mean square error</td>
                <td>Normalized root mean square error</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>FluView, Symptoms, and Treatment</td>
                <td>2012-2013</td>
                <td>None</td>
                <td>0.912</td>
                <td>0.423</td>
                <td>0.070</td>
              </tr>
              <tr valign="top">
                <td>Symptoms</td>
                <td>2015-2016</td>
                <td>None</td>
                <td>0.892</td>
                <td>0.213</td>
                <td>0.060</td>
              </tr>
              <tr valign="top">
                <td>FluView</td>
                <td>2013-2014</td>
                <td>None</td>
                <td>0.802</td>
                <td>0.510</td>
                <td>0.111</td>
              </tr>
              <tr valign="top">
                <td>Antivirals and Prevention</td>
                <td>2014-2015</td>
                <td>None</td>
                <td>0.778</td>
                <td>0.615</td>
                <td>0.103</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>These plots show national models and the associated pages and influenza seasons. (A) FluView, Symptoms, and Treatment, 2012 to 2013. (B) Symptoms, 2015 to 2016. (C) FluView, 2013 to 2014. (D) Antivirals and Prevention, 2014 to 2015. CDC: Centers for Disease Control and Prevention; ILI: influenza-like illness.</p>
          </caption>
          <graphic xlink:href="jmir_v22i7e14337_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Census Division Results</title>
        <p>Using the data for each of the 9 census divisions, we were able to achieve an <italic>r<sup>2</sup></italic> value greater than 0.7 for at least one case for each division. We considered all seasons together and separately, with the better results obtained from modeling each individual season. We considered all 10 pages together as well as combinations of one or more of these pages. In the most successful case, the model was able to closely match the 2012 to 2013 influenza season for the West North Central division, with an <italic>r<sup>2</sup></italic> value of 0.955 using the FluView, Symptoms, and Treatment pages. Although we had successes using all 10 pages, the most successful model for each division involved only these 3 pages. <xref rid="figure5" ref-type="fig">Figure 5</xref> shows some of these models, and <xref ref-type="table" rid="table2">Table 2</xref> highlights these successes.</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Census division model successes using the FluView, Symptoms, and Treatment pages for the 2012 to 2013 influenza season. (A) West North Central, 2012 to 2013. (B) Mountain, 2012 to 2013. (C) East North Central, 1-week shift, 2012 to 2013. (D) Pacific, 2012 to 2013. (E) West South Central, 2012 to 2013. These plots represent the census division models that had the highest <italic>r<sup>2</sup></italic> value in the 2012 to 2013 influenza season. CDC: Centers for Disease Control and Prevention; ILI: influenza-like illness.</p>
          </caption>
          <graphic xlink:href="jmir_v22i7e14337_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>The 9 census divisions and the season and shift for which the division’s model had the highest <italic>r<sup>2</sup></italic> value. The table also shows the root mean square error and the normalized root mean square error. The results presented correspond to the FluView, Symptoms, and Treatment pages aggregated.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="190"/>
            <col width="110"/>
            <col width="80"/>
            <col width="70"/>
            <col width="220"/>
            <col width="330"/>
            <thead>
              <tr valign="bottom">
                <td>Division</td>
                <td>Season</td>
                <td>Shift</td>
                <td>
                  <italic>r</italic>
                  <sup>2</sup>
                </td>
                <td>Root mean square error</td>
                <td>Normalized root mean square error</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>West North Central</td>
                <td>2012-2013</td>
                <td>None</td>
                <td>0.955</td>
                <td>0.367</td>
                <td>0.057</td>
              </tr>
              <tr valign="top">
                <td>Mountain</td>
                <td>2012-2013</td>
                <td>None</td>
                <td>0.921</td>
                <td>0.336</td>
                <td>0.077</td>
              </tr>
              <tr valign="top">
                <td>New England</td>
                <td>2015-2016</td>
                <td>None</td>
                <td>0.920</td>
                <td>0.096</td>
                <td>0.096</td>
              </tr>
              <tr valign="top">
                <td>East North Central</td>
                <td>2012-2013</td>
                <td>1 week</td>
                <td>0.899</td>
                <td>0.331</td>
                <td>0.076</td>
              </tr>
              <tr valign="top">
                <td>South Atlantic</td>
                <td>2015-2016</td>
                <td>None</td>
                <td>0.893</td>
                <td>0.218</td>
                <td>0.065</td>
              </tr>
              <tr valign="top">
                <td>Middle Atlantic</td>
                <td>2015-2016</td>
                <td>None</td>
                <td>0.861</td>
                <td>0.302</td>
                <td>0.073</td>
              </tr>
              <tr valign="top">
                <td>Pacific</td>
                <td>2012-2013</td>
                <td>None</td>
                <td>0.849</td>
                <td>0.503</td>
                <td>0.094</td>
              </tr>
              <tr valign="top">
                <td>West South Central</td>
                <td>2012-2013</td>
                <td>None</td>
                <td>0.828</td>
                <td>0.986</td>
                <td>0.105</td>
              </tr>
              <tr valign="top">
                <td>East South Central</td>
                <td>2015-2016</td>
                <td>1 week</td>
                <td>0.793</td>
                <td>0.365</td>
                <td>0.082</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>State Results</title>
        <p>We found <italic>r<sup>2</sup></italic> for each of the states considered in this study, using a variety of pages and page combinations. <xref ref-type="table" rid="table3">Table 3</xref> lists the most successful models for each state, the season, the data shift, and the <italic>r<sup>2</sup></italic> value.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>The most successful results for each state considered in this study.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="140"/>
            <col width="110"/>
            <col width="110"/>
            <col width="80"/>
            <col width="70"/>
            <col width="200"/>
            <col width="290"/>
            <thead>
              <tr valign="bottom">
                <td>State</td>
                <td>Page(s)</td>
                <td>Season</td>
                <td>Shift</td>
                <td>
                  <italic>r</italic>
                  <sup>2</sup>
                </td>
                <td>Root mean square error</td>
                <td>Normalized root mean square error</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Texas</td>
                <td>All<sup>a</sup></td>
                <td>2012-2013</td>
                <td>1 week</td>
                <td>0.930</td>
                <td>0.667</td>
                <td>0.067</td>
              </tr>
              <tr valign="top">
                <td>Wisconsin</td>
                <td>FVST<sup>b</sup></td>
                <td>2012-2013</td>
                <td>None</td>
                <td>0.833</td>
                <td>0.533</td>
                <td>0.127</td>
              </tr>
              <tr valign="top">
                <td>New Jersey</td>
                <td>All</td>
                <td>2012-2013</td>
                <td>1 week</td>
                <td>0.832</td>
                <td>0.767</td>
                <td>0.117</td>
              </tr>
              <tr valign="top">
                <td>Missouri</td>
                <td>FVST</td>
                <td>2012-2013</td>
                <td>1 week</td>
                <td>0.823</td>
                <td>0.801</td>
                <td>0.127</td>
              </tr>
              <tr valign="top">
                <td>North Carolina</td>
                <td>FVST</td>
                <td>2015-2016</td>
                <td>1 week</td>
                <td>0.781</td>
                <td>0.455</td>
                <td>0.106</td>
              </tr>
              <tr valign="top">
                <td>New Mexico</td>
                <td>All</td>
                <td>2015-2016</td>
                <td>1 week</td>
                <td>0.771</td>
                <td>1.184</td>
                <td>0.197</td>
              </tr>
              <tr valign="top">
                <td>California</td>
                <td>FVST</td>
                <td>2012-2013</td>
                <td>1 week</td>
                <td>0.758</td>
                <td>0.777</td>
                <td>0.125</td>
              </tr>
              <tr valign="top">
                <td>Maine</td>
                <td>Antivirals</td>
                <td>2012-2013</td>
                <td>None</td>
                <td>0.662</td>
                <td>0.445</td>
                <td>0.171</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup><italic>All</italic> refers to the aggregation of all 10 pages.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup><italic>FVST</italic> refers to the aggregation of the FluView, Symptoms, and Treatment pages.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p><xref rid="figure6" ref-type="fig">Figure 6</xref> shows both successes and failures at the state level. Adding all the pages together, we were able to obtain <italic>r<sup>2</sup></italic> values of 0.930 and 0.801 for Texas and Wisconsin, respectively, for the 2012 to 2013 influenza season. For the 2013 to 2014 season, the highest <italic>r<sup>2</sup></italic> value was 0.187 for Wisconsin. For the 2014 to 2015 season, the highest <italic>r<sup>2</sup></italic>value was 0.322 for Missouri. For the 2015 to 2016 season, the highest <italic>r<sup>2</sup></italic> value was 0.647 for North Carolina.</p>
        <fig id="figure6" position="float">
          <label>Figure 6</label>
          <caption>
            <p>Different states during different seasons. (A) Texas, 1-week shift, 2012 to 2013. (B) Wisconsin, 2013 to 2014. (C) Missouri, 2014 to 2015. (D) North Carolina, 2015 to 2016. (E) Wisconsin, 2012 to 2013. The <italic>r<sup>2</sup></italic> values of each of these models ranged from 0.187 to 0.930. These models aggregated all 10 pages, and the success varied by state. CDC: Centers for Disease Control and Prevention; ILI: influenza-like illness.</p>
          </caption>
          <graphic xlink:href="jmir_v22i7e14337_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>We were not surprised that Texas had the best fit. Texas was the only state we included that provided ILI data not only for the typical influenza season but also for the off-season. These additional data likely contributed to the success of the Texas models. In keeping with our nowcasting scenario, we only included data available during the study period. During that period, Texas was the only state that provided off-season ILI data. These data have since been made available from other states, but the availability was not present during the study. The lack of success we encountered in modeling Maine was also expected because of Maine’s outlier behavior in ILI, having values considerably lower and out of pattern with other states. The models in <xref rid="figure6" ref-type="fig">Figure 6</xref> included all 10 pages aggregated together. However, as indicated by the individual state results, this does not always lead to the best fit. Successful models often include a combination of select pages (such as FluView, Symptoms, and Treatment) but not an aggregation of all 10. Furthermore, aside from Texas, we did not have ILI data for the states outside of the typical flu season. Without these additional data, we are unable to determine how strongly the lower page views in the off-season correlate with off-season ILI.</p>
        <p>We then shifted the ILI data forward by 1 week. The regression analysis yielded 7 state/season combinations with <italic>r<sup>2</sup></italic> values greater than 0.7 (<xref ref-type="table" rid="table4">Table 4</xref>). The table also includes both the regular RMSE and NRMSE.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>States with models that had an <italic>r<sup>2</sup></italic> value greater than 0.7 when aggregating all 10 pages and shifting the influenza-like illness data forward by 1 week. The regular and normalized root mean square errors are also displayed.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="170"/>
            <col width="130"/>
            <col width="80"/>
            <col width="250"/>
            <col width="370"/>
            <thead>
              <tr valign="bottom">
                <td>State</td>
                <td>Season</td>
                <td>
                  <italic>r</italic>
                  <sup>2</sup>
                </td>
                <td>Root mean square error</td>
                <td>Normalized root mean square error</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Texas</td>
                <td>2012-2013</td>
                <td>0.930</td>
                <td>0.667</td>
                <td>0.067</td>
              </tr>
              <tr valign="top">
                <td>New Jersey</td>
                <td>2012-2013</td>
                <td>0.832</td>
                <td>0.767</td>
                <td>0.117</td>
              </tr>
              <tr valign="top">
                <td>New Mexico</td>
                <td>2015-2016</td>
                <td>0.771</td>
                <td>1.184</td>
                <td>0.197</td>
              </tr>
              <tr valign="top">
                <td>California</td>
                <td>2012-2013</td>
                <td>0.746</td>
                <td>0.797</td>
                <td>0.129</td>
              </tr>
              <tr valign="top">
                <td>Wisconsin</td>
                <td>2012-2013</td>
                <td>0.727</td>
                <td>0.626</td>
                <td>0.153</td>
              </tr>
              <tr valign="top">
                <td>North Carolina</td>
                <td>2015-2016</td>
                <td>0.708</td>
                <td>1.028</td>
                <td>0.204</td>
              </tr>
              <tr valign="top">
                <td>Missouri</td>
                <td>2012-2013</td>
                <td>0.702</td>
                <td>1.039</td>
                <td>0.165</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <p>Adding only the FluView, Symptoms, and Treatment pages, we obtained an <italic>r</italic><sup>2</sup> value of 0.7 or greater for 6 state/season combinations. For the 2013 to 2014 season, the highest <italic>r</italic><sup>2</sup> values were 0.612 for California and 0.568 for Wisconsin. Although this is still less than desired, it is a vast improvement from the <italic>r</italic><sup>2</sup> values found from adding all 10 pages. For the 2014 to 2015 season, the highest <italic>r</italic><sup>2</sup> was 0.575 for Missouri. Again, although the correlation appears to be weak, it is a stronger correlation than taking all 10 pages together. Using the same 3 pages and implementing a 1-week shift, we obtained an <italic>r</italic><sup>2</sup> value of 0.7 or greater for 10 state/season combinations. For the 2014 to 2015 season, the highest <italic>r</italic><sup>2</sup> value was 0.548 for Missouri.</p>
      </sec>
      <sec>
        <title>State Influenza-Like Illness Data Availability</title>
        <p>The purpose of this study was to demonstrate the viability of near real-time nowcasting during the influenza seasons from 2013 to 2016. To maintain the premise of nowcasting, we chose states with publicly available data, or data available on request, during the period of the study. During the study period, state ILI data were not readily available on the CDC website. Instead, we had to rely on data available through state health-related organizations for each state. In addition, throughout the course of influenza seasons, ILI numbers are often updated as delayed data are reported and made available. However, because we are focusing our study on a nowcasting scenario, we do not consider the ILI data from those seasons as they are reported today but rather as they were reported during the study period.</p>
      </sec>
      <sec>
        <title>Model Failures</title>
        <p>We generally found the models to be successful when considering pages most closely related to typical health-seeking behavior and when considering each flu season individually. When trying to model multiple influenza seasons together, we had a number of unsuccessful models. Considering all pages and national ILI data, the model combining the 2012 to 2013 and 2013 to 2014 influenza seasons had an <italic>r</italic><sup>2</sup> value of 0.061 and RMSE of 0.553. The combined 2013 to 2014 and 2014 to 2015 model had an <italic>r</italic><sup>2</sup> value of 0.241 and RMSE of 0.208. The combined 2014 to 2015 and 2015 to 2016 model had an <italic>r</italic><sup>2</sup> value of 0.251 and RMSE of 0.286. At the state level, combining all pages resulted in a number of unsuccessful models. For the 2013 to 2014 season, the Wisconsin model had an <italic>r</italic><sup>2</sup> value of 0.187 and RMSE of 0.523. For the 2014 to 2015 season, the Missouri model had an <italic>r</italic><sup>2</sup> value of 0.322 and RMSE of 1.845. Model failures not included in this section can be found in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>.</p>
        <p>We speculate that a number of factors could contribute to these negative results. Although influenza is a seasonal disease, similar strains can span multiple years, affecting the susceptible populations in subsequent years. Our data stream may be biased toward individuals with more awareness of the CDC. Furthermore, individuals who search for influenza information in one season may not search for that information the next year. Finally, with the exception of Texas, we only have ILI data for the influenza season itself. Thus, although we do have internet data for off-season influenza page views, we do not have corresponding ILI data.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Conclusions</title>
        <p>Internet surveillance data have proven beneficial in predicting ILI incidence during flu seasons. However, our results show that the benefit of internet data streams on informing disease is inconclusive; that is, this study shows that the CDC website traffic can be informative in some cases (eg, national level) but not in others (eg, state level). To determine the extent, we must return to our original research questions.</p>
        <sec>
          <title>Research Question 1</title>
          <p>Given the successes of some of our models, we can conclude that CDC page view data can be used as an additional data source for monitoring disease incidence in some cases (eg, at the national level). The degree to which these data can be used appears to rely on the page selection and time frame. The results of the best models varied across geographic and temporal resolutions, but some trends were consistent in most cases. We obtained successful nowcasts when selecting pages related to topics most commonly used for web-based health queries (specific diseases and treatments) during the time span of a typical influenza season. Longer time spans and pages less associated with specific diseases and treatments led to less successful models. Outlier behavior, such as the ILI data in Maine, affected our models and resulted in less successful models than states with ILI curves exhibiting expected behavior. These results can assist others in selecting appropriate supplemental datasets for disease surveillance as well as appropriate spatial and temporal resolutions.</p>
        </sec>
        <sec>
          <title>Research Question 2</title>
          <p>We obtained our most successful results using a 1-week shift. Moreover, 2-week shifts were successful in some cases but were overall less correlated than 1-week shifts (<xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>). Using no shift at all proved successful in some cases but not in others. We surmise that the shift required for the best fit depends on the incubation period for the disease in question as well as the period of reporting. The CDC internet data are available daily; however, ILI data are available weekly, so we are limited in the types of shifts we can apply to the datasets. Another factor that could contribute to the need for a 1-week shift is the amount of time between the page view and the subsequent visit to a health care center. If there are one or more days between the page view and the visit, then these 2 events could occur during different weeks. Shifting the data by 1 week accounts for this behavior.</p>
        </sec>
      </sec>
      <sec>
        <title>Future Work</title>
        <p>We conclude that more studies on internet data streams are needed to understand when and why internet data work. Our methods are consistent with other feasibility studies and provide insight into the conditions under which internet data streams may inform influenza models. Future work should include rigorously testing the predictive power of the models by separating data into training and testing sets [<xref ref-type="bibr" rid="ref6">6</xref>].</p>
        <p>More studies on geographic resolution could provide a better insight into why some models outperform others at various spatial resolutions. National models across single influenza seasons performed well, with each season included in the study having at least one model with an <italic>r</italic><sup>2</sup> value greater than 0.75. We attribute the national model successes to the representation of all 50 states. Internet access may not be as prevalent in all states, but the inclusion of all 50 states allows for more data to be considered. Likewise, the census division models performed well, with overall <italic>r</italic><sup>2</sup> values greater than those achieved from national models. Each census division had at least one model in the study with an <italic>r</italic><sup>2</sup> value greater than 0.79. We attribute these successes to not only the inclusion of all states but also the division into geographic areas. There are instances in which a person may live in one state and seek medical care in another, perhaps because of working in a neighboring state. These instances are not accounted for by simply looking at states but can be accounted for by considering several neighboring states for 1 model. At the state level, models were overall less successful than at national and census division levels, but each state considered in the study had at least one model with an <italic>r</italic><sup>2</sup> value greater than 0.65, and all but Maine had models with an <italic>r</italic><sup>2</sup> value greater than 0.75. We attribute the overall lower success of state models to a combination of varying levels of internet access across populated and rural areas, the possibility of people living near neighboring states seeking health care in another state, and the inconsistencies in data availability during the study period. As our study focused on using data sources available during the study, we were limited in the states we could model because of the scarcity of the data.</p>
        <p>More studies on temporal resolution could provide a better insight into how best to model seasonal diseases over multiple seasons. Models across multiple seasons were not successful, which we attribute in part to the off-season ILI data being unavailable during the study period. As influenza is a seasonal disease, modeling multiple seasons with 1 model may not be the correct approach, and our multiseason models support this idea. However, more exhaustive studies are needed to draw definitive conclusions on the appropriate temporal resolution for modeling influenza.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Names and date ranges of web pages used in this study.</p>
        <media xlink:href="jmir_v22i7e14337_app1.docx" xlink:title="DOCX File , 14 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Sources for state influenza-like illness data used in this study.</p>
        <media xlink:href="jmir_v22i7e14337_app2.docx" xlink:title="DOCX File , 15 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Influenza-like illness data for the nine census divisions.</p>
        <media xlink:href="jmir_v22i7e14337_app3.xls" xlink:title="XLS File  (Microsoft Excel File), 55 KB"/>
      </supplementary-material>
      <supplementary-material id="app4">
        <label>Multimedia Appendix 4</label>
        <p>State influenza-like illness data.</p>
        <media xlink:href="jmir_v22i7e14337_app4.xls" xlink:title="XLS File  (Microsoft Excel File), 33 KB"/>
      </supplementary-material>
      <supplementary-material id="app5">
        <label>Multimedia Appendix 5</label>
        <p>The nine US census divisions, listing all states in each division.</p>
        <media xlink:href="jmir_v22i7e14337_app5.docx" xlink:title="DOCX File , 15 KB"/>
      </supplementary-material>
      <supplementary-material id="app6">
        <label>Multimedia Appendix 6</label>
        <p>Comprehensive list of all models in this study not included in the main text. The list includes model successes and model failures.</p>
        <media xlink:href="jmir_v22i7e14337_app6.docx" xlink:title="DOCX File , 47 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">CDC</term>
          <def>
            <p>Centers for Disease Control and Prevention</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">ILI</term>
          <def>
            <p>influenza-like illness</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">NRMSE</term>
          <def>
            <p>normalized root mean square error</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">OLS</term>
          <def>
            <p>ordinary least squares</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">RMSE</term>
          <def>
            <p>root mean square error</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was supported by the Los Alamos National Laboratory Information Science and Technology Institute and the National Institutes of Health, the National Institute of General Medical Sciences, and the Modeling for Infectious Disease Agent Study program under grant U01-GM097658-071. Matthew Biggerstaff of the CDC provided integral insight and support. Curt Canada and the Data Science at Scale Summer School were integral to this project. Stephen Wirkus, Abigail Hunter, and Catherine S Plesko provided recommendations and clarifications. Jonathan Woodring, David H Rogers, and Francesca Samsel provided technical and visual assistance. Los Alamos National Laboratory is operated by Triad National Security, Limited Liability Company, for the National Nuclear Security Administration for the US Department of Energy (contract number: 89233218NCA000001).</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>WC, GF, and SD conceptualized the project and performed data analysis. GF performed data curation. WC wrote the manuscript. GF and SD edited the manuscript. WC created the visualizations.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hickmann</surname>
              <given-names>KS</given-names>
            </name>
            <name name-style="western">
              <surname>Fairchild</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Priedhorsky</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Generous</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Hyman</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Deshpande</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>del Valle</surname>
              <given-names>SY</given-names>
            </name>
          </person-group>
          <article-title>Forecasting the 2013-2014 influenza season using Wikipedia</article-title>
          <source>PLoS Comput Biol</source>
          <year>2015</year>
          <month>05</month>
          <volume>11</volume>
          <issue>5</issue>
          <fpage>e1004239</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pcbi.1004239"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pcbi.1004239</pub-id>
          <pub-id pub-id-type="medline">25974758</pub-id>
          <pub-id pub-id-type="pii">PCOMPBIOL-D-14-01771</pub-id>
          <pub-id pub-id-type="pmcid">PMC4431683</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Molinari</surname>
              <given-names>NM</given-names>
            </name>
            <name name-style="western">
              <surname>Ortega-Sanchez</surname>
              <given-names>IR</given-names>
            </name>
            <name name-style="western">
              <surname>Messonnier</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Thompson</surname>
              <given-names>WW</given-names>
            </name>
            <name name-style="western">
              <surname>Wortley</surname>
              <given-names>PM</given-names>
            </name>
            <name name-style="western">
              <surname>Weintraub</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Bridges</surname>
              <given-names>CB</given-names>
            </name>
          </person-group>
          <article-title>The annual impact of seasonal influenza in the US: measuring disease burden and costs</article-title>
          <source>Vaccine</source>
          <year>2007</year>
          <month>06</month>
          <day>28</day>
          <volume>25</volume>
          <issue>27</issue>
          <fpage>5086</fpage>
          <lpage>96</lpage>
          <pub-id pub-id-type="doi">10.1016/j.vaccine.2007.03.046</pub-id>
          <pub-id pub-id-type="medline">17544181</pub-id>
          <pub-id pub-id-type="pii">S0264-410X(07)00385-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Putri</surname>
              <given-names>WC</given-names>
            </name>
            <name name-style="western">
              <surname>Muscatello</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Stockwell</surname>
              <given-names>MS</given-names>
            </name>
            <name name-style="western">
              <surname>Newall</surname>
              <given-names>AT</given-names>
            </name>
          </person-group>
          <article-title>Economic burden of seasonal influenza in the United States</article-title>
          <source>Vaccine</source>
          <year>2018</year>
          <month>06</month>
          <day>22</day>
          <volume>36</volume>
          <issue>27</issue>
          <fpage>3960</fpage>
          <lpage>6</lpage>
          <pub-id pub-id-type="doi">10.1016/j.vaccine.2018.05.057</pub-id>
          <pub-id pub-id-type="medline">29801998</pub-id>
          <pub-id pub-id-type="pii">S0264-410X(18)30677-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dugas</surname>
              <given-names>AF</given-names>
            </name>
            <name name-style="western">
              <surname>Jalalpour</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gel</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Levin</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Torcaso</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Igusa</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Rothman</surname>
              <given-names>RE</given-names>
            </name>
          </person-group>
          <article-title>Influenza forecasting with Google flu trends</article-title>
          <source>PLoS One</source>
          <year>2013</year>
          <volume>8</volume>
          <issue>2</issue>
          <fpage>e56176</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0056176"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0056176</pub-id>
          <pub-id pub-id-type="medline">23457520</pub-id>
          <pub-id pub-id-type="pii">PONE-D-12-29961</pub-id>
          <pub-id pub-id-type="pmcid">PMC3572967</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="web">
          <source>Centers for Disease Control and Prevention</source>
          <access-date>2019-10-11</access-date>
          <comment>Glossary of Influenza (Flu) Terms <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cdc.gov/flu/about/glossary.htm">https://www.cdc.gov/flu/about/glossary.htm</ext-link></comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Priedhorsky</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Osthus</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Daughton</surname>
              <given-names>AR</given-names>
            </name>
            <name name-style="western">
              <surname>Moran</surname>
              <given-names>KR</given-names>
            </name>
            <name name-style="western">
              <surname>Generous</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Fairchild</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Deshpande</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>del Valle</surname>
              <given-names>SY</given-names>
            </name>
          </person-group>
          <article-title>Measuring global disease with Wikipedia: success, failure, and a research agenda</article-title>
          <source>CSCW Conf Comput Support Coop Work</source>
          <year>2017</year>
          <volume>2017</volume>
          <fpage>1812</fpage>
          <lpage>34</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/28782059"/>
          </comment>
          <pub-id pub-id-type="doi">10.1145/2998181.2998183</pub-id>
          <pub-id pub-id-type="medline">28782059</pub-id>
          <pub-id pub-id-type="pmcid">PMC5542563</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Generous</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Fairchild</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Deshpande</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>del Valle</surname>
              <given-names>SY</given-names>
            </name>
            <name name-style="western">
              <surname>Priedhorsky</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Global disease monitoring and forecasting with Wikipedia</article-title>
          <source>PLoS Comput Biol</source>
          <year>2014</year>
          <month>11</month>
          <volume>10</volume>
          <issue>11</issue>
          <fpage>e1003892</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pcbi.1003892"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pcbi.1003892</pub-id>
          <pub-id pub-id-type="medline">25392913</pub-id>
          <pub-id pub-id-type="pii">PCOMPBIOL-D-14-00678</pub-id>
          <pub-id pub-id-type="pmcid">PMC4231164</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McIver</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Brownstein</surname>
              <given-names>JS</given-names>
            </name>
          </person-group>
          <article-title>Wikipedia usage estimates prevalence of influenza-like illness in the United States in near real-time</article-title>
          <source>PLoS Comput Biol</source>
          <year>2014</year>
          <month>04</month>
          <volume>10</volume>
          <issue>4</issue>
          <fpage>e1003581</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pcbi.1003581"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pcbi.1003581</pub-id>
          <pub-id pub-id-type="medline">24743682</pub-id>
          <pub-id pub-id-type="pii">PCOMPBIOL-D-13-02242</pub-id>
          <pub-id pub-id-type="pmcid">PMC3990502</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Polgreen</surname>
              <given-names>PM</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Pennock</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Nelson</surname>
              <given-names>FD</given-names>
            </name>
            <name name-style="western">
              <surname>Weinstein</surname>
              <given-names>RA</given-names>
            </name>
          </person-group>
          <article-title>Using internet searches for influenza surveillance</article-title>
          <source>Clin Infect Dis</source>
          <year>2008</year>
          <month>12</month>
          <day>1</day>
          <volume>47</volume>
          <issue>11</issue>
          <fpage>1443</fpage>
          <lpage>8</lpage>
          <pub-id pub-id-type="doi">10.1086/593098</pub-id>
          <pub-id pub-id-type="medline">18954267</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Seok</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Oh</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>HW</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>KH</given-names>
            </name>
          </person-group>
          <article-title>Use of Hangeul Twitter to track and predict human influenza infection</article-title>
          <source>PLoS One</source>
          <year>2013</year>
          <volume>8</volume>
          <issue>7</issue>
          <fpage>e69305</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0069305"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0069305</pub-id>
          <pub-id pub-id-type="medline">23894447</pub-id>
          <pub-id pub-id-type="pii">PONE-D-12-37217</pub-id>
          <pub-id pub-id-type="pmcid">PMC3722273</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Baltrusaitis</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Brownstein</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Scarpino</surname>
              <given-names>SV</given-names>
            </name>
            <name name-style="western">
              <surname>Bakota</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Crawley</surname>
              <given-names>AW</given-names>
            </name>
            <name name-style="western">
              <surname>Conidi</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Gunn</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Gray</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zink</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Santillana</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Comparison of crowd-sourced, electronic health records based, and traditional health-care based influenza-tracking systems at multiple spatial resolutions in the United States of America</article-title>
          <source>BMC Infect Dis</source>
          <year>2018</year>
          <month>08</month>
          <day>15</day>
          <volume>18</volume>
          <issue>1</issue>
          <fpage>403</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcinfectdis.biomedcentral.com/articles/10.1186/s12879-018-3322-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12879-018-3322-3</pub-id>
          <pub-id pub-id-type="medline">30111305</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12879-018-3322-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC6094455</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brownstein</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Chu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Marathe</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Marathe</surname>
              <given-names>MV</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>AT</given-names>
            </name>
            <name name-style="western">
              <surname>Paolotti</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Perra</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Perrotta</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Santillana</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Swarup</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tizzoni</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Vespignani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Vullikanti</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Wilson</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Q</given-names>
            </name>
          </person-group>
          <article-title>Combining participatory influenza surveillance with modeling and forecasting: three alternative approaches</article-title>
          <source>JMIR Public Health Surveill</source>
          <year>2017</year>
          <month>11</month>
          <day>1</day>
          <volume>3</volume>
          <issue>4</issue>
          <fpage>e83</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://publichealth.jmir.org/2017/4/e83/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/publichealth.7344</pub-id>
          <pub-id pub-id-type="medline">29092812</pub-id>
          <pub-id pub-id-type="pii">v3i4e83</pub-id>
          <pub-id pub-id-type="pmcid">PMC5688248</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="web">
          <source>Pew Research Center</source>
          <year>2014</year>
          <month>02</month>
          <day>28</day>
          <access-date>2016-07-11</access-date>
          <comment>Health Fact Sheet <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.pewresearch.org/internet/health-fact-sheet-copy/">https://www.pewresearch.org/internet/health-fact-sheet-copy/</ext-link></comment>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fox</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <source>Pew Research Center</source>
          <year>2011</year>
          <access-date>2016-07-11</access-date>
          <comment>Health Topics <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://pewinternet.org/2011/02/01/health-topics-2/">http://pewinternet.org/2011/02/01/health-topics-2/</ext-link></comment>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ginsberg</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mohebbi</surname>
              <given-names>MH</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>RS</given-names>
            </name>
            <name name-style="western">
              <surname>Brammer</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Smolinski</surname>
              <given-names>MS</given-names>
            </name>
            <name name-style="western">
              <surname>Brilliant</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Detecting influenza epidemics using search engine query data</article-title>
          <source>Nature</source>
          <year>2009</year>
          <month>02</month>
          <day>19</day>
          <volume>457</volume>
          <issue>7232</issue>
          <fpage>1012</fpage>
          <lpage>4</lpage>
          <pub-id pub-id-type="doi">10.1038/nature07634</pub-id>
          <pub-id pub-id-type="medline">19020500</pub-id>
          <pub-id pub-id-type="pii">nature07634</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chretien</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>George</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Shaman</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chitale</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>McKenzie</surname>
              <given-names>FE</given-names>
            </name>
          </person-group>
          <article-title>Influenza forecasting in human populations: a scoping review</article-title>
          <source>PLoS One</source>
          <year>2014</year>
          <volume>9</volume>
          <issue>4</issue>
          <fpage>e94130</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0094130"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0094130</pub-id>
          <pub-id pub-id-type="medline">24714027</pub-id>
          <pub-id pub-id-type="pii">PONE-D-13-53481</pub-id>
          <pub-id pub-id-type="pmcid">PMC3979760</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Perrotta</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Tizzoni</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Paolotti</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Using Participatory Web-Based Surveillance Data to Improve Seasonal Influenza Forecasting in Italy</article-title>
          <source>Proceedings of the 26th International Conference on World Wide Web</source>
          <year>2017</year>
          <conf-name>WWW'17</conf-name>
          <conf-date>April 3-7, 2017</conf-date>
          <conf-loc>Perth, Australia</conf-loc>
          <fpage>303</fpage>
          <lpage>10</lpage>
          <pub-id pub-id-type="doi">10.1145/3038912.3052670</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shaman</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Karspeck</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Forecasting seasonal outbreaks of influenza</article-title>
          <source>Proc Natl Acad Sci U S A</source>
          <year>2012</year>
          <month>12</month>
          <day>11</day>
          <volume>109</volume>
          <issue>50</issue>
          <fpage>20425</fpage>
          <lpage>30</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.pnas.org/cgi/pmidlookup?view=long&amp;pmid=23184969"/>
          </comment>
          <pub-id pub-id-type="doi">10.1073/pnas.1208772109</pub-id>
          <pub-id pub-id-type="medline">23184969</pub-id>
          <pub-id pub-id-type="pii">1208772109</pub-id>
          <pub-id pub-id-type="pmcid">PMC3528592</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lampos</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Cox</surname>
              <given-names>IJ</given-names>
            </name>
          </person-group>
          <article-title>Enhancing Feature Selection Using Word Embeddings: The Case of Flu Surveillance</article-title>
          <source>Proceedings of the 26th International Conference on World Wide Web</source>
          <year>2017</year>
          <conf-name>WWW'17</conf-name>
          <conf-date>April 3-7, 2017</conf-date>
          <conf-loc>Perth, Australia</conf-loc>
          <fpage>695</fpage>
          <lpage>704</lpage>
          <pub-id pub-id-type="doi">10.1145/3038912.3052622</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ru</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Predicting Epidemic Tendency through Search Behavior Analysis</article-title>
          <source>Proceedings of the Twenty-Second International Joint Conference on Artificial Intelligence</source>
          <year>2011</year>
          <conf-name>IJCAI'11</conf-name>
          <conf-date>July 16-22, 2011</conf-date>
          <conf-loc>Barcelona, Spain</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.ijcai.org/Proceedings/11/Papers/393.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bao</surname>
              <given-names>JX</given-names>
            </name>
            <name name-style="western">
              <surname>Lv</surname>
              <given-names>BF</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Gonorrhea Incidence Forecasting Research Based on Baidu Search Data</article-title>
          <source>International Conference on Management Science and Engineering 20th Annual Conference Proceedings</source>
          <year>2013</year>
          <conf-name>ICMSE'13</conf-name>
          <conf-date>July 17-19, 2013</conf-date>
          <conf-loc>Harbin, China</conf-loc>
          <fpage>36</fpage>
          <lpage>42</lpage>
          <pub-id pub-id-type="doi">10.1109/icmse.2013.6586259</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bardak</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Prediction of Influenza Outbreaks by Integrating Wikipedia Article Access Logs and Google Flu Trend Data</article-title>
          <source>Proceedings of the 15th International Conference on Bioinformatics and Bioengineering</source>
          <year>2015</year>
          <conf-name>BIBE'15</conf-name>
          <conf-date>November 2-4, 2015</conf-date>
          <conf-loc>Belgrade, Serbia</conf-loc>
          <fpage>1</fpage>
          <lpage>6</lpage>
          <pub-id pub-id-type="doi">10.1109/bibe.2015.7367640</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Biggerstaff</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Alper</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Dredze</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Fox</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Fung</surname>
              <given-names>IC</given-names>
            </name>
            <name name-style="western">
              <surname>Hickmann</surname>
              <given-names>KS</given-names>
            </name>
            <name name-style="western">
              <surname>Lewis</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Rosenfeld</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Shaman</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tsou</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Velardi</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Vespignani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Finelli</surname>
              <given-names>L</given-names>
            </name>
            <collab>Influenza Forecasting Contest Working Group</collab>
          </person-group>
          <article-title>Results from the Centers for Disease Control and Prevention's Predict the 2013-2014 Influenza Season Challenge</article-title>
          <source>BMC Infect Dis</source>
          <year>2016</year>
          <month>07</month>
          <day>22</day>
          <volume>16</volume>
          <fpage>357</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcinfectdis.biomedcentral.com/articles/10.1186/s12879-016-1669-x"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12879-016-1669-x</pub-id>
          <pub-id pub-id-type="medline">27449080</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12879-016-1669-x</pub-id>
          <pub-id pub-id-type="pmcid">PMC4957319</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Broniatowski</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Paul</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Dredze</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>National and local influenza surveillance through Twitter: an analysis of the 2012-2013 influenza epidemic</article-title>
          <source>PLoS One</source>
          <year>2013</year>
          <volume>8</volume>
          <issue>12</issue>
          <fpage>e83672</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0083672"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0083672</pub-id>
          <pub-id pub-id-type="medline">24349542</pub-id>
          <pub-id pub-id-type="pii">PONE-D-13-35058</pub-id>
          <pub-id pub-id-type="pmcid">PMC3857320</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Perra</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Perrotta</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Tizzoni</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Paolotti</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Vespignani</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Forecasting Seasonal Influenza Fusing Digital Indicators and a Mechanistic Disease Model</article-title>
          <source>Proceedings of the 26th International Conference on World Wide Web</source>
          <year>2017</year>
          <conf-name>WWW'17</conf-name>
          <conf-date>April 3-7, 2017</conf-date>
          <conf-loc>Perth, Australia</conf-loc>
          <fpage>311</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1145/3038912.3052678</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Paul</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Dredze</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Broniatowski</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Twitter improves influenza forecasting</article-title>
          <source>PLoS Curr</source>
          <year>2014</year>
          <month>10</month>
          <day>28</day>
          <volume>6</volume>
          <fpage>ecurrents.outbreaks.90b9ed0f59bae4ccaa683a39865d9117</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.doi.org/10.1371/currents.outbreaks.90b9ed0f59bae4ccaa683a39865d9117"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/currents.outbreaks.90b9ed0f59bae4ccaa683a39865d9117</pub-id>
          <pub-id pub-id-type="medline">25642377</pub-id>
          <pub-id pub-id-type="pmcid">PMC4234396</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Santillana</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>AT</given-names>
            </name>
            <name name-style="western">
              <surname>Dredze</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Paul</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Nsoesie</surname>
              <given-names>EO</given-names>
            </name>
            <name name-style="western">
              <surname>Brownstein</surname>
              <given-names>JS</given-names>
            </name>
          </person-group>
          <article-title>Combining search, social media, and traditional data sources to improve influenza surveillance</article-title>
          <source>PLoS Comput Biol</source>
          <year>2015</year>
          <month>10</month>
          <volume>11</volume>
          <issue>10</issue>
          <fpage>e1004513</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pcbi.1004513"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pcbi.1004513</pub-id>
          <pub-id pub-id-type="medline">26513245</pub-id>
          <pub-id pub-id-type="pii">PCOMPBIOL-D-15-00856</pub-id>
          <pub-id pub-id-type="pmcid">PMC4626021</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lampos</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>AC</given-names>
            </name>
            <name name-style="western">
              <surname>Crossan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Stefansen</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Advances in nowcasting influenza-like illness rates using search query logs</article-title>
          <source>Sci Rep</source>
          <year>2015</year>
          <month>08</month>
          <day>03</day>
          <volume>5</volume>
          <fpage>12760</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.doi.org/10.1038/srep12760"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/srep12760</pub-id>
          <pub-id pub-id-type="medline">26234783</pub-id>
          <pub-id pub-id-type="pii">srep12760</pub-id>
          <pub-id pub-id-type="pmcid">PMC4522652</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Santillana</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kou</surname>
              <given-names>SC</given-names>
            </name>
          </person-group>
          <article-title>Accurate estimation of influenza epidemics using Google search data via ARGO</article-title>
          <source>Proc Natl Acad Sci USA</source>
          <year>2015</year>
          <month>11</month>
          <day>24</day>
          <volume>112</volume>
          <issue>47</issue>
          <fpage>14473</fpage>
          <lpage>8</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.pnas.org/cgi/pmidlookup?view=long&amp;pmid=26553980"/>
          </comment>
          <pub-id pub-id-type="doi">10.1073/pnas.1515373112</pub-id>
          <pub-id pub-id-type="medline">26553980</pub-id>
          <pub-id pub-id-type="pii">1515373112</pub-id>
          <pub-id pub-id-type="pmcid">PMC4664296</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>HA</given-names>
            </name>
            <name name-style="western">
              <surname>Wagner</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Hogan</surname>
              <given-names>WR</given-names>
            </name>
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Olszewski</surname>
              <given-names>RT</given-names>
            </name>
            <name name-style="western">
              <surname>Dowling</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Barnas</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Analysis of web access logs for surveillance of influenza</article-title>
          <source>Stud Health Technol Inform</source>
          <year>2004</year>
          <volume>107</volume>
          <issue>Pt 2</issue>
          <fpage>1202</fpage>
          <lpage>6</lpage>
          <pub-id pub-id-type="medline">15361003</pub-id>
          <pub-id pub-id-type="pii">D040005485</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Seabold</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Perktold</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Statsmodels: Econometric and Statistical Modeling with Python</article-title>
          <source>Proceedings of the 9th Python in Science Conference</source>
          <year>2010</year>
          <conf-name>SCIPY'10</conf-name>
          <conf-date>June 28 - July 3, 2010</conf-date>
          <conf-loc>Austin, Texas</conf-loc>
          <fpage>92</fpage>
          <lpage>6</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://conference.scipy.org/proceedings/scipy2010/pdfs/seabold.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
