<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
    <article-id pub-id-type="publisher-id">v18i7e177</article-id>
    <article-id pub-id-type="pmid">27377323</article-id>
    <article-id pub-id-type="doi">10.2196/jmir.4955</article-id>
    <article-categories>
      <subj-group subj-group-type="heading">
        <subject>Original Paper</subject>
      </subj-group>
      <subj-group subj-group-type="article-type">
        <subject>Original Paper</subject>
      </subj-group>
    </article-categories>
    <title-group>
      <article-title>Estimating Influenza Outbreaks Using Both Search Engine Query Data and Social Media Data in South Korea</article-title>
    </title-group>
    <contrib-group>
      <contrib contrib-type="editor">
        <name>
          <surname>Eysenbach</surname>
          <given-names>Gunther</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Nsoesie</surname>
          <given-names>Elaine</given-names>
        </name>
      </contrib>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Shin</surname>
          <given-names>Soo-Yong</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="author" id="contrib1">
        <name name-style="western">
          <surname>Woo</surname>
          <given-names>Hyekyung</given-names>
        </name>
        <degrees>MPH, PhD</degrees>
        <xref rid="aff1" ref-type="aff">1</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0001-5489-3404</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib2" corresp="yes">
      <name name-style="western">
        <surname>Cho</surname>
        <given-names>Youngtae</given-names>
      </name>
      <degrees>PhD</degrees>
      <xref rid="aff1" ref-type="aff">1</xref>
      <address>
        <institution>Department of Health Science and Service</institution>
        <institution>School of Public Health</institution>
        <institution>Seoul National University</institution>
        <addr-line>1 Kwanakro</addr-line>
        <addr-line>Kwanakgu</addr-line>
        <addr-line>Seoul, 151-172</addr-line>
        <country>Republic Of Korea</country>
        <phone>82 10 7135 4610</phone>
        <fax>82 2 762 9105</fax>
        <email>youngtae@snu.ac.kr</email>
      </address>  
      <ext-link ext-link-type="orcid">http://orcid.org/0000-0003-1641-282X</ext-link></contrib>
      <contrib contrib-type="author" id="contrib3">
        <name name-style="western">
          <surname>Shim</surname>
          <given-names>Eunyoung</given-names>
        </name>
        <degrees>MPH, PhD</degrees>
        <xref rid="aff1" ref-type="aff">1</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-0730-2955</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib4">
        <name name-style="western">
          <surname>Lee</surname>
          <given-names>Jong-Koo</given-names>
        </name>
        <degrees>MD, PhD</degrees>
        <xref rid="aff2" ref-type="aff">2</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0003-4833-1178</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib5">
        <name name-style="western">
          <surname>Lee</surname>
          <given-names>Chang-Gun</given-names>
        </name>
        <degrees>PhD</degrees>
        <xref rid="aff3" ref-type="aff">3</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0001-7434-0495</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib6">
        <name name-style="western">
          <surname>Kim</surname>
          <given-names>Seong Hwan</given-names>
        </name>
        <degrees>PhD</degrees>
        <xref rid="aff4" ref-type="aff">4</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-0406-7929</ext-link>
      </contrib>
    </contrib-group>
    <aff id="aff1">
    <sup>1</sup>
    <institution>Department of Health Science and Service</institution>
    <institution>School of Public Health</institution>  
    <institution>Seoul National University</institution>  
    <addr-line>Seoul</addr-line>
    <country>Republic Of Korea</country></aff>
    <aff id="aff2">
    <sup>2</sup>
    <institution>Global Medical Center</institution>
    <institution>School of Medicine</institution>  
    <institution>Seoul National University</institution>  
    <addr-line>Seoul</addr-line>
    <country>Republic Of Korea</country></aff>
    <aff id="aff3">
    <sup>3</sup>
    <institution>Real-time Ubiquitous System Laboratory</institution>
    <institution>Department of Computer Science and Engineering</institution>  
    <institution>Seoul National University</institution>  
    <addr-line>Seoul</addr-line>
    <country>Republic Of Korea</country></aff>
    <aff id="aff4">
    <sup>4</sup>
    <institution>Search SU</institution>
    <institution>Datamining Team</institution>  
    <institution>Daum Kakao Incorporated</institution>  
    <addr-line>Seoul</addr-line>
    <country>Republic Of Korea</country></aff>
    <author-notes>
      <corresp>Corresponding Author: Youngtae Cho 
      <email>youngtae@snu.ac.kr</email></corresp>
    </author-notes>
    <pub-date pub-type="collection"><month>07</month><year>2016</year></pub-date>
    <pub-date pub-type="epub">
      <day>04</day>
      <month>07</month>
      <year>2016</year>
    </pub-date>
    <volume>18</volume>
    <issue>7</issue>
    <elocation-id>e177</elocation-id>
    <!--history from ojs - api-xml-->
    <history>
      <date date-type="received">
        <day>25</day>
        <month>7</month>
        <year>2015</year>
      </date>
      <date date-type="rev-request">
        <day>3</day>
        <month>9</month>
        <year>2015</year>
      </date>
      <date date-type="rev-recd">
        <day>17</day>
        <month>4</month>
        <year>2016</year>
      </date>
      <date date-type="accepted">
        <day>19</day>
        <month>5</month>
        <year>2016</year>
      </date>
    </history>
    <!--(c) the authors - correct author names and publication date here if necessary. Date in form ', dd.mm.yyyy' after jmir.org-->
    <copyright-statement>©Hyekyung Woo, Youngtae Cho, Eunyoung Shim, Jong-Koo Lee, Chang-Gun Lee, Seong Hwan Kim. Originally published in the Journal of Medical Internet Research (http://www.jmir.org), 04.07.2016.</copyright-statement>
    <copyright-year>2016</copyright-year>
    <license license-type="open-access" xlink:href="http://creativecommons.org/licenses/by/2.0/">
      <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (http://creativecommons.org/licenses/by/2.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on http://www.jmir.org/, as well as this copyright and license information must be included.</p>
    </license>  
    <self-uri xlink:href="http://www.jmir.org/2016/7/e177/" xlink:type="simple"/>
    <abstract>
      <sec sec-type="background">
        <title>Background</title>
        <p>As suggested as early as 2006, logs of queries submitted to search engines seeking information could be a source for detection of emerging influenza epidemics if changes in the volume of search queries are monitored (infodemiology). However, selecting queries that are most likely to be associated with influenza epidemics is a particular challenge when it comes to generating better predictions.</p>
      </sec>
      <sec sec-type="objective">
        <title>Objective</title>
        <p>In this study, we describe a methodological extension for detecting influenza outbreaks using search query data; we provide a new approach for query selection through the exploration of contextual information gleaned from social media data. Additionally, we evaluate whether it is possible to use these queries for monitoring and predicting influenza epidemics in South Korea.</p>
      </sec>
      <sec sec-type="methods">
        <title>Methods</title>
        <p>Our study was based on freely available weekly influenza incidence data and query data originating from the search engine on the Korean website Daum between April 3, 2011 and April 5, 2014. To select queries related to influenza epidemics, several approaches were applied: (1) exploring influenza-related words in social media data, (2) identifying the chief concerns related to influenza, and (3) using Web query recommendations. Optimal feature selection by least absolute shrinkage and selection operator (Lasso) and support vector machine for regression (SVR) were used to construct a model predicting influenza epidemics.</p>
      </sec>
      <sec sec-type="results">
        <title>Results</title>
        <p>In total, 146 queries related to influenza were generated through our initial query selection approach. A considerable proportion of optimal features for final models were derived from queries with reference to the social media data. The SVR model performed well: the prediction values were highly correlated with the recent observed influenza-like illness (<italic>r</italic>=.956; <italic>P</italic>&#60;.001) and virological incidence rate (<italic>r</italic>=.963; <italic>P</italic>&#60;.001).</p>
      </sec>
      <sec sec-type="conclusions">
        <title>Conclusions</title>
        <p>These results demonstrate the feasibility of using search queries to enhance influenza surveillance in South Korea. In addition, an approach for query selection using social media data seems ideal for supporting influenza surveillance based on search query data.</p>
      </sec>
    </abstract>
    <kwd-group>
      <kwd>influenza</kwd>
      <kwd>surveillance</kwd>
      <kwd>population surveillance</kwd>
      <kwd>infodemiology</kwd>
      <kwd>infoveillance</kwd>
      <kwd>Internet search</kwd>
      <kwd>query</kwd>
      <kwd>social media</kwd>
      <kwd>big data</kwd>
      <kwd>forecasting</kwd>
      <kwd>epidemiology</kwd>
      <kwd>early response</kwd>
    </kwd-group></article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>An early and now well-known example of utilizing Internet data for health-related applications came from the estimation of influenza incidence using anonymous logs of Web search engine queries. First proposed in 2006 by Eysenbach under the umbrella term “infodemiology”, numerous recent studies have added further evidence of a correlation between search query data from Google [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>], Yahoo! [<xref ref-type="bibr" rid="ref4">4</xref>], Baidu [<xref ref-type="bibr" rid="ref5">5</xref>], or other medical websites [<xref ref-type="bibr" rid="ref6">6</xref>] and traditional data used for influenza surveillance, such as influenza-like illness (ILI) and/or laboratory-confirmed data. These studies indicate that individuals faced with disease or ill health will search for information on the Internet regarding their state of health and possible countermeasures to illness; logs of queries submitted to search engines by individuals seeking this information are potential sources of information for detecting emerging epidemics, as it is possible to track changes in the volumes of specific search queries. However, the recent errors arising from Google Flu Trends, which has been predominantly used in previous studies, serve as a reminder to investigators that this novel data paradigm calls for critical assessment and the development of more empirical methodologies to explore the predictive utility of big data [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. It is clear that current and future studies need to focus on methods to more precisely identify the particular phases associated with influenza epidemics based on data from these highly informative sources.</p>
      <p>Selecting the queries that are most likely to be associated with influenza epidemics poses a particular challenge for the generation of improved predictions. In previous studies, researchers have utilized queries selected by various methods, such as specific keyword tools offered by particular websites [<xref ref-type="bibr" rid="ref5">5</xref>], surveys of patients who visited the emergency room [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref9">9</xref>], or common knowledge about influenza including the definition of ILI [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>], as well as fully automated methods for identifying queries related to influenza from search logs [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. Because researchers do not have full access to search logs, an approach using social media data may also be helpful for obtaining information for query selection. Recently, social media data have been highlighted as an additional potential data source for disease surveillance because they contain a greater variety of contextual health information with diverse descriptions of health states. Thus, it could be a useful reference point for researchers who wish to select initial target queries in query-based prediction.</p>
      <p>In South Korea, there is currently no forecasting system for infectious disease based on search query data [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref9">9</xref>], despite the high availability and use of the Internet in Korea [<xref ref-type="bibr" rid="ref11">11</xref>]. Moreover, few studies thus far have evaluated whether such data could be of value in national influenza forecasting [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref9">9</xref>], and a recent study has suggested that Google Trends in the Korean language is insufficient for use as a model for influenza prediction in South Korea [<xref ref-type="bibr" rid="ref1">1</xref>]. We need to proactively determine whether queries of search engines that are more widely used by Koreans have the capacity to enhance traditional influenza surveillance systems in South Korea. We consider the use of social media data to select queries that are most likely to be associated with influenza epidemics in a situation involving limited access to search logs. An attempt to exploit the complementary nature of two types of data sources could result in a rapid and efficient prediction of the occurrence of influenza and its proliferation, thereby allowing for better recognition of influenza and initiation of preventive measures.</p>
      <p>The purpose of this study was to further explore two concerns: (1) to describe a methodological extension for detecting influenza outbreaks using search query data, providing a new approach for query selection through the exploration of contextual information obtained from social media data, and (2) to evaluate whether it is possible to use these queries for monitoring influenza epidemics in South Korea.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Sources</title>
        <sec>
          <title>Epidemiological Surveillance Data</title>
          <p>National influenza surveillance data were obtained from the Korean Center for Disease Control and Prevention (KCDC), which routinely collects epidemiological data and national statistics pertaining to influenza incidence, typically with a 1-week reporting lag [<xref ref-type="bibr" rid="ref12">12</xref>]. We used clinical data and virological data from April 3, 2011 (listed as week 32) to April 5, 2014 (listed as week 14). For clinical data, we used the rates of physician visits for ILI; for virological data, the rates for positive results for the influenza virus in laboratory tests. The data obtained were anonymous and publicly available.</p>
        </sec>
        <sec>
          <title>Social Media Data</title>
          <p>In developing an approach for query selection, we drew on social media data. Social media data were collected from the daily Naver blog (a weblog service offered by the biggest portal site in South Korea [<xref ref-type="bibr" rid="ref13">13</xref>]) and Twitter posts between September 1, 2010 and August 31, 2013 (3 years), using the social “big data” mining system, SOCIALmetricsAcademy. This system contains social media data crawlers that collect posts from Twitter and the Naver blog. The system also processes text using state-of-the-art natural language processing and text-mining technologies. The Twitter crawler utilizes a streaming application program interface (API) for data collection using the “track keywords” function. We tracked several thousand keywords that were empirically selected and tuned to maximize the coverage of the crawler operating in near real time. We estimated that the daily coverage of the Twitter crawler was more than 80%. The collected posts were fed into a spam-filtering module that checked for posts containing spam keywords written by known spammers. The lists of spam keywords and spammers were semiautomatically monitored and managed. The Naver blog crawler resembles general-purpose Web crawlers, the main difference being that a list of active bloggers for post collection is maintained and automatically expanded. The estimated coverage of the Naver blog crawler was also more than 80%. We applied an extensive spam-filtering process similar to that of the Twitter crawler on the collected blog posts.</p>
          <p>The authors and data mining company conducted the search according to the Twitter and blogging website terms and conditions of use. All Twitter and Naver blog posts were publicly available and the information collected did not reveal the identity of the social media users; thus, user confidentiality was preserved.</p>
        </sec>
        <sec>
          <title>Search Engine Query Data</title>
          <p>The query data originated from the search engine on the Korean website, Daum [<xref ref-type="bibr" rid="ref14">14</xref>]. Although Google is the most-used search engine in the world, it is not dominant in South Korea. Local search engines based on the Korean language, such as Daum, are more widely used than Google. Daum is the second-largest search engine in the portal sites market of South Korea [<xref ref-type="bibr" rid="ref15">15</xref>]. Because the query data of Korean websites were not publicly available, we sent the list of target queries to Daum and received scaled volume data pertaining to the queries listed. Weekly relative volumes of queries submitted to the search engine between April 3, 2011 and April 5, 2014, were used for analysis. The relative volumes were calculated by dividing the number of each query by the total number of search queries in any given week. The website Daum is written in Korean, thus the submitted queries are primarily in Korean. No information was available that could have potentially revealed the identity of a website visitor; therefore, complete confidentiality was maintained.</p>
        </sec>
      </sec>
      <sec>
        <title>Query Selection</title>
        <p>To obtain queries related to influenza that were submitted to the Daum search engine by the Korean population at large, several approaches were applied. Search queries were obtained using the following methods.</p>
        <sec>
          <title>Seed Keyword for Exploring the Queries</title>
          <p>Although “influenza” is the official term used by the KCDC, <italic>dokgam</italic>, <italic>inpeulruenja</italic>, <italic>peulru</italic>, and <italic>sinjongpeulru</italic> are the words typically used in Korea to describe influenza. Since the 2009 pandemic of influenza virus A (H1N1), the term <italic>sinjongpeulru</italic> to describe the new strain of flu has been more popular in Korea than the term influenza A (H1N1). Thus, <italic>dokgam</italic>, <italic>inpeulruenja</italic>, <italic>peulru</italic>, <italic>sinjongpeulru</italic>, “influenza,” and “flu” were defined as seed keywords for exploring the queries. Because Web search queries typically consist of word combinations of an average of two or three terms [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>], these seed keywords were also used as essential keywords in word combinations.</p>
        </sec>
        <sec>
          <title>Exploring Influenza-Related Words Through Social Media Data</title>
          <p>To obtain search queries related to influenza, we considered the words that usually appear with the word influenza in the accumulated posts submitted to Twitter and blogs. We first conducted synonym processing for the seed keywords of <italic>dokgam, inpeulruenja, peulru</italic>, <italic>sinjongpeulru</italic>, influenza, and flu, and named the resulting app Flu. Then, we investigated the words most likely to be associated with Flu using the accumulated posts during the critical 3-year period (between September 1, 2010 and August 31, 2013). Association analysis was performed to identify tuples of topic keyword and associated keywords. This analysis resulted in a total of 157 associated words.</p>
          <p>Certain words associated with influenza were not related to influenza seasons or were not commonly entered into search engines. We excluded keywords that occurred infrequently during the influenza season and those that showed nonsequential patterns in the time series throughout the tracking period. Although relatively rare, we also excluded Korean word combinations written in the form of an incomplete sentence. Therefore, we excluded words considered as inadequate candidates for search query following the keyword filtering; in our first phase, we generated 103 candidate queries of single words or word combinations consisting of seed keywords and/or words associated with influenza as determined using social media data.</p>
        </sec>
        <sec>
          <title>Identifying Chief Concerns Related to Influenza</title>
          <p>Some additional queries related to influenza were obtained through a review of influenza symptoms referring to patients’ chief concerns. The influenza surveillance system of the KCDC defines ILI as the sudden onset of high fever (38°C or greater) accompanied by a cough and/or sore throat. These symptoms, based on the definition of ILI, were included. Additionally, we included influenza symptom definitions used by the Centers for Disease Control and Prevention (CDC) [<xref ref-type="bibr" rid="ref18">18</xref>] and a consultative committee of medical doctors; this second phase generated 29 candidate queries of single words or word combinations consisting of seed keywords and associated words in reference to chief concerns relating to influenza.</p>
        </sec>
        <sec>
          <title>Using Web Query Recommendations</title>
          <p>Internet search users often require multiple iterations of query refinement to find the desired results from a search engine [<xref ref-type="bibr" rid="ref16">16</xref>]. Users of search engines can improve their Web search through the help of query recommendations that suggest lists of related queries, allowing users to improve the usability of Web search engines and to access queries that better represent their search intent [<xref ref-type="bibr" rid="ref17">17</xref>]. We considered queries suggested by keyword recommendations from the Korean websites Daum and Naver. In this third phase, entering Flu into the search engines allowed us to identify 75 related queries in the form of single words or word combinations.</p>
        </sec>
      </sec>
      <sec>
        <title>Feature Selection and Prediction Model</title>
        <p>We divided the data into training and validation sets. Data from April 3, 2011 to June 29, 2013, were used as the training set for modeling and data from June 30, 2013 to April 5, 2014, were used as the validation set for the model test. Volumes of six seed queries and 146 related queries, obtained after duplicate queries were eliminated from the set of 216 candidate queries, were used for analysis. Before applying the algorithm to each dataset, all data were preprocessed by appropriate transformation and normalization methods. To identify optimal predictors, we applied a least absolute shrinkage and selection operator (Lasso) algorithm. Feature selection can be used to avoid overfitting of irrelevant features and to improve predictive performance (ie, resulting in more rapid and cost-effective predictions) [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. The least absolute shrinkage and selection operator (Lasso) algorithm benefits from a tendency to assign zero weights to irrelevant or redundant features and, hence, is an effective technique for shrinkage and feature selection [<xref ref-type="bibr" rid="ref21">21</xref>]. Because we aimed to identify predictors of influenza epidemics, feature selection processing was performed at three time points (defined as lag -2, -1, and 0) on the training set portion of the influenza surveillance data using 10-fold cross-validation. We considered all optimal features selected in each lag for model building.</p>
        <p>Support vector machine for regression (SVR) was conducted to construct a model predicting influenza epidemics with selected features. Support vector machines, which are represented as one of the kernel-based methods in supervised machine learning, have been applied successfully to classification tasks and, more recently, also to regression [<xref ref-type="bibr" rid="ref22">22</xref>]. Grid search and 10-fold cross-validation were performed to select the optimal SVR parameter settings, including the penalty parameter <italic>C</italic> and the kernel function parameter such as the gamma for the radial basis function kernel. Ranges of values for grid search can be summarized as follows (elements in each list denote the beginning, end, and number of samples to generate, respectively): penalty parameter <italic>C</italic> (0.01, 10, 0.01); gamma (0.0001, 1, 0.0001). We assessed the root mean square error (RMSE), particular log errors, and the correlation between predicted values and influenza surveillance data using the validation set. All statistical analyses were performed using the R software package (version 3.0.3; R Development Core Team, Auckland, New Zealand).</p>
      </sec>
      <sec>
        <title>Ethics Statement</title>
        <p>This study was exempted from ethical review by the Institutional Review Board of Seoul National University.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>A total of 146 queries related to influenza were generated through our initial query selection approach (see <xref ref-type="app" rid="app1">Multimedia Appendix 1</xref>). Feature selection was performed based on 152 queries including six seed keywords, and optimal features for the prediction of influenza incidence were chosen using 10-fold cross-validation. <xref ref-type="table" rid="table1">Table 1</xref> presents the results of feature selection based on ILI surveillance data. Of the 152 queries, 15, 14, and 29 principal features (the total number of features without duplication=36) exhibited the minimum lambda value in lag-2, lag-1, and 0, respectively. The optimal features for the prediction of ILI incidence were derived from queries with reference to social media data (29/36 features), query recommendations (24/36 features), chief concerns relating to influenza (4/36 features), and seed keywords (1/36 features) (<xref ref-type="table" rid="table1">Table 1</xref>).</p>
      <p>We evaluated the performance of the prediction model, created on the basis of the training set for ILI surveillance, with the validation set. Our results indicated that the SVR model (<italic>C=</italic> 1.32; gamma=0.0002) performed well; the prediction values were highly correlated with recently observed ILI incidence rates (<italic>r</italic>=.956; <italic>P</italic>&#60;.001) (see <xref ref-type="fig" rid="figure1">Figure 1</xref>, <xref ref-type="app" rid="app2">Multimedia Appendix 2</xref>, and <xref ref-type="app" rid="app4">Multimedia Appendix 4</xref>).</p>
      <p>We adopted the same principle with regard to the prediction of virological surveillance as we did with ILI. <xref ref-type="table" rid="table2">Table 2</xref> presents the results of feature selection based on virological surveillance data. Of the 152 queries, 28, 26, and 45 principal features (the total number of features without duplication=53) exhibited the minimum lambda value in lag-2, lag-1, and 0, respectively. The optimal features for the prediction of virological incidence were also derived from queries with reference to social media data (42/53), query recommendations (31/53), chief concerns relating to influenza (7/53), and seed keywords (1/53) (<xref ref-type="table" rid="table2">Table 2</xref>).</p>
      <p><xref ref-type="fig" rid="figure2">Figure 2</xref> shows the result of the performance of the prediction model for virological surveillance. The SVR model (<italic>C=</italic> 2.14; gamma=0.0006) performed well; the prediction values were highly correlated with recently observed virological incidence rates (<italic>r</italic>=.963; <italic>P</italic>&#60;.001) (see <xref ref-type="fig" rid="figure2">Figure 2</xref>, <xref ref-type="app" rid="app3">Multimedia Appendix 3</xref>, and <xref ref-type="app" rid="app4">Multimedia Appendix 4</xref>).</p>
      <table-wrap position="float" id="table1">
        <label>Table 1</label>
        <caption>
          <p>Optimal features for influenza-like illness surveillance.</p>
        </caption>
        <table width="625" cellpadding="7" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="198"/>
          <col width="158"/>
          <col width="64"/>
          <col width="70"/>
          <col width="64"/>
          <thead>
          <tr valign="top">
            <td>Query</td>
            <td>Query reference</td>
            <td colspan="3">Coefficient</td>
          </tr>
          <tr valign="top">
            <td><break/></td>
            <td><break/></td>
            <td>Lag-2</td>
            <td>Lag-1</td>
            <td>Lag 0</td>
          </tr>  
          <tr valign="top">
            <td>(Intercept)</td>
            <td><break/></td>
            <td>0.332</td>
            <td>0.321</td>
            <td>0.497</td>
          </tr></thead>
          <tbody>
            <tr valign="top">
              <td>A hyeong influenza [influenza A type]</td>
              <td>Social media; query recommendation</td>
              <td>0.745</td>
              <td>0</td>
              <td>0.109</td>
            </tr>
            <tr valign="top">
              <td>A hyeong dokgam [influenza A type]</td>
              <td>Social media; query recommendation</td>
              <td>4.928</td>
              <td>20.154</td>
              <td>21.503</td>
            </tr>
            <tr valign="top">
              <td>A hyeong inpeulruenja [influenza A type]</td>
              <td>Social media; query recommendation</td>
              <td>0.065</td>
              <td>0.761</td>
              <td>1.127</td>
            </tr>
            <tr valign="top">
              <td>B hyeong influenza [influenza B type]</td>
              <td>Social media; query recommendation</td>
              <td>0</td>
              <td>0</td>
              <td>0.345</td>
            </tr>
            <tr valign="top">
              <td>B hyeong dokgam [influenza B type]</td>
              <td>Social media; query recommendation</td>
              <td>0</td>
              <td>0.029</td>
              <td>1.447</td>
            </tr>
            <tr valign="top">
              <td>Influenza A</td>
              <td>Social media; query recommendation</td>
              <td>2.345</td>
              <td>0.086</td>
              <td>0</td>
            </tr>
            <tr valign="top">
              <td>Influenza A hyeong [influenza A type]</td>
              <td>Social media; query recommendation</td>
              <td>1.894</td>
              <td>0.927</td>
              <td>0.029</td>
            </tr>
            <tr valign="top">
              <td>Vaccine</td>
              <td>Social media</td>
              <td>0</td>
              <td>0</td>
              <td>–0.1151</td>
            </tr>
            <tr valign="top">
              <td>Geongang [health]</td>
              <td>Social media</td>
              <td>0.393</td>
              <td>0.395</td>
              <td>0.109</td>
            </tr>
            <tr valign="top">
              <td>Dokgamgamyeom [flu infection]</td>
              <td>Social media</td>
              <td>0.052</td>
              <td>0</td>
              <td>0</td>
            </tr>
            <tr valign="top">
              <td>Dokgamgeomsa [flu check]</td>
              <td>Social media; query recommendation</td>
              <td>4.303</td>
              <td>8.893</td>
              <td>4.402</td>
            </tr>
            <tr valign="top">
              <td>Dokgam gyeokrigigan [flu isolation period]</td>
              <td>Query recommendation</td>
              <td>0</td>
              <td>0</td>
              <td>0.177</td>
            </tr>
            <tr valign="top">
              <td>Dokgam gichim [flu cough]</td>
              <td>Social media; chief concern</td>
              <td>0</td>
              <td>0</td>
              <td>1.106</td>
            </tr>
            <tr valign="top">
              <td>Dokgam baireoseu [flu virus]</td>
              <td>Social media; query recommendation</td>
              <td>0</td>
              <td>0</td>
              <td>–0.220</td>
            </tr>
            <tr valign="top">
              <td>Dokgam yeol [flu fever]</td>
              <td>Chief concern</td>
              <td>0.391</td>
              <td>0</td>
              <td>0</td>
            </tr>
            <tr valign="top">
              <td>Dokgam yebang [flu prevention]</td>
              <td>Social media; query recommendation</td>
              <td>0</td>
              <td>0</td>
              <td>–0.152</td>
            </tr>
            <tr valign="top">
              <td>Dokgam yebangjeopjong [flu vaccination]</td>
              <td>Social media; query recommendation</td>
              <td>0</td>
              <td>0</td>
              <td>–0.1174</td>
            </tr>
            <tr valign="top">
              <td>Dokgam ipwon [flu hospitalization]</td>
              <td>Social media; query recommendation</td>
              <td>0</td>
              <td>0</td>
              <td>1.470</td>
            </tr>
            <tr valign="top">
              <td>Dokgam jeonyeom [flu infection]</td>
              <td>Social media; query recommendation</td>
              <td>0</td>
              <td>0</td>
              <td>2.569</td>
            </tr>
            <tr valign="top">
              <td>Dokgam jeonpa [flu dissemination]</td>
              <td>Social media; query recommendation</td>
              <td>0.547</td>
              <td>0.322</td>
              <td>0.017</td>
            </tr>
            <tr valign="top">
              <td>Dokgam pyeryeom [flu pneumonia]</td>
              <td>Social media; chief concern</td>
              <td>0</td>
              <td>0</td>
              <td>0.005</td>
            </tr>
            <tr valign="top">
              <td>Dokgam hakgyo [flu school]</td>
              <td>Social media</td>
              <td>0</td>
              <td>0.122</td>
              <td>0</td>
            </tr>
            <tr valign="top">
              <td>Dokgam hwanja [flu patient]</td>
              <td>Social media</td>
              <td>0.066</td>
              <td>0</td>
              <td>0</td>
            </tr>
            <tr valign="top">
              <td>Soa dokgamjeungsang [child flu symptoms]</td>
              <td>Query recommendation</td>
              <td>0.811</td>
              <td>0.323</td>
              <td>0.135</td>
            </tr>
            <tr valign="top">
              <td>Sinjongpeulru jeungsang [new flu symptoms]</td>
              <td>Social media; query recommendation</td>
              <td>55.980</td>
              <td>46.156</td>
              <td>58.415</td>
            </tr>
            <tr valign="top">
              <td>Simhangamgi [severe cold]</td>
              <td>Social media</td>
              <td>0</td>
              <td>0</td>
              <td>0.031</td>
            </tr>
            <tr valign="top">
              <td>Eorini dokgamyuhaeng [child flu epidemic]</td>
              <td>Query recommendation</td>
              <td>0</td>
              <td>0</td>
              <td>0.002</td>
            </tr>
            <tr valign="top">
              <td>Onmomi apeum [whole body pain]</td>
              <td>Chief concern</td>
              <td>0</td>
              <td>0.038</td>
              <td>0.072</td>
            </tr>
            <tr valign="top">
              <td>Inpeulruenja geomsa [influenza check]</td>
              <td>Social media; query recommendation</td>
              <td>0</td>
              <td>0.233</td>
              <td>0</td>
            </tr>
            <tr valign="top">
              <td>Inpeulruenja yak [influenza medicine]</td>
              <td>Social media; query recommendation</td>
              <td>0</td>
              <td>0</td>
              <td>–0.005</td>
            </tr>
            <tr valign="top">
              <td>Inpeulruenja yuhaeng [influenza epidemic]</td>
              <td>Social media</td>
              <td>0</td>
              <td>0</td>
              <td>0.003</td>
            </tr>
            <tr valign="top">
              <td>Inpeulruenja jeungsang [influenza symptoms]</td>
              <td>Social media; query recommendation</td>
              <td>6.254</td>
              <td>0</td>
              <td>0</td>
            </tr>
            <tr valign="top">
              <td>Inpeulruenja jeungse [influenza symptoms]</td>
              <td>Social media; query recommendation</td>
              <td>0</td>
              <td>0</td>
              <td>0.209</td>
            </tr>
            <tr valign="top">
              <td>Junggukdokgam [China influenza]</td>
              <td>Query recommendation</td>
              <td>0</td>
              <td>0</td>
              <td>–0.056</td>
            </tr>
            <tr valign="top">
              <td>Tamipeulru [Tamiflu]</td>
              <td>Social media; query recommendation</td>
              <td>0</td>
              <td>0</td>
              <td>0.517</td>
            </tr>
            <tr valign="top">
              <td>Peulru [flu]</td>
              <td>Seed keyword</td>
              <td>0.621</td>
              <td>0.562</td>
              <td>0.339</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
      <table-wrap position="float" id="table2">
        <label>Table 2</label>
        <caption>
          <p>Optimal features for virological surveillance.</p>
        </caption>
        <table width="625" cellpadding="7" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="198"/>
          <col width="158"/>
          <col width="64"/>
          <col width="70"/>
          <col width="64"/>
          <thead>
            <tr valign="top">
              <td>Query</td>
              <td>Query reference</td>
              <td colspan="3">Coefficient</td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td><break/></td>
              <td>Lag-2</td>
              <td>Lag-1</td>
              <td>Lag 0</td>
            </tr>
            <tr valign="top">
              <td>(Intercept)</td>
              <td><break/></td>
              <td>–1.459</td>
              <td>–3.124</td>
              <td>–2.147</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>A hyeong influenza [influenza A type]</td>
              <td>Social media; query recommendation</td>
              <td>26.413</td>
              <td>18.899</td>
              <td>22.579</td>
            </tr>
            <tr valign="top">
              <td>A hyeong dokgam [influenza A type]</td>
              <td>Social media; query recommendation</td>
              <td>0</td>
              <td>0</td>
              <td>379.041</td>
            </tr>
            <tr valign="top">
              <td>B hyeong dokgam [influenza B type]</td>
              <td>Social media; query recommendation</td>
              <td>6.007</td>
              <td>15.324</td>
              <td>24.039</td>
            </tr>
            <tr valign="top">
              <td>B hyeong dokgamjeungsang [symptoms of influenza B type]</td>
              <td>Social media; query recommendation</td>
              <td>0</td>
              <td>0</td>
              <td>0.229</td>
            </tr>
            <tr valign="top">
              <td>Influenza A</td>
              <td>Social media; query recommendation</td>
              <td>37.953</td>
              <td>25.021</td>
              <td>17.449</td>
            </tr>
            <tr valign="top">
              <td>Influenza ahyeong [influenza A type]</td>
              <td>Social media; query recommendation</td>
              <td>24.114</td>
              <td>19.342</td>
              <td>11.426</td>
            </tr>
            <tr valign="top">
              <td>Gamgibaireoseu [cold virus]</td>
              <td>Social media</td>
              <td>0</td>
              <td>0</td>
              <td>4.898</td>
            </tr>
            <tr valign="top">
              <td>Gamgi pparrinatneunbeop [how to cure a cold quickly]</td>
              <td>Query recommendation</td>
              <td>5.365</td>
              <td>4.262</td>
              <td>2.343</td>
            </tr>
            <tr valign="top">
              <td>Gamgiyebang [cold prevention]</td>
              <td>Social media; query recommendation</td>
              <td>0</td>
              <td>0</td>
              <td>–0.450</td>
            </tr>
            <tr valign="top">
              <td>Gamgiyebangbeop [how to prevent a cold]</td>
              <td>Social media</td>
              <td>–0.155</td>
              <td>–2.736</td>
              <td>–4.140</td>
            </tr>
            <tr valign="top">
              <td>Geongang [health]</td>
              <td>Social media</td>
              <td>4.091</td>
              <td>3.562</td>
              <td>3.390</td>
            </tr>
            <tr valign="top">
              <td>Geunyuktong [muscle pain]</td>
              <td>Social media; chief concern</td>
              <td>0</td>
              <td>0</td>
              <td>–0.265</td>
            </tr>
            <tr valign="top">
              <td>Nalssi [weather]</td>
              <td>Social media</td>
              <td>0</td>
              <td>0</td>
              <td>–0.111</td>
            </tr>
            <tr valign="top">
              <td>Dokgam ahyeong [flu A type]</td>
              <td>Social media; query recommendation</td>
              <td>0</td>
              <td>0</td>
              <td>22.772</td>
            </tr>
            <tr valign="top">
              <td>Dokgamgamyeom [flu infection]</td>
              <td>Social media</td>
              <td>12.236</td>
              <td>1.449</td>
              <td>0</td>
            </tr>
            <tr valign="top">
              <td>Dokgamgeomsa [flu check]</td>
              <td>Social media; query recommendation</td>
              <td>38.254</td>
              <td>31.878</td>
              <td>0</td>
            </tr>
            <tr valign="top">
              <td>Dokgam gyeokrigigan [flu isolation period]</td>
              <td>Query recommendation</td>
              <td>0</td>
              <td>0</td>
              <td>12.145</td>
            </tr>
            <tr valign="top">
              <td>Dokgam goyeol [flu high fever]</td>
              <td>Social media; chief concern</td>
              <td>0</td>
              <td>0</td>
              <td>1.745</td>
            </tr>
            <tr valign="top">
              <td>Dokgam gichim [flu cough]</td>
              <td>Social media; chief concern</td>
              <td>0</td>
              <td>0</td>
              <td>25.911</td>
            </tr>
            <tr valign="top">
              <td>Dokgam noin [flu in the elderly]</td>
              <td>Social media</td>
              <td>0</td>
              <td>0</td>
              <td>–3.739</td>
            </tr>
            <tr valign="top">
              <td>Dokgam baireoseu [flu virus]</td>
              <td>Social media; query recommendation</td>
              <td>0</td>
              <td>0</td>
              <td>–0.777</td>
            </tr>
            <tr valign="top">
              <td>Dokgam i [flu child]</td>
              <td>Social media</td>
              <td>0</td>
              <td>0</td>
              <td>2.694</td>
            </tr>
            <tr valign="top">
              <td>Dokgam eorini [flu child]</td>
              <td>Social media</td>
              <td>0</td>
              <td>0</td>
              <td>–0.477</td>
            </tr>
            <tr valign="top">
              <td>Dokgam yebang [flu prevention]</td>
              <td>Social media; query recommendation</td>
              <td>–2.467</td>
              <td>–9.760</td>
              <td>–12.191</td>
            </tr>
            <tr valign="top">
              <td>Dokgam yebanghaneunbangbeop [how to prevent flu]</td>
              <td>Query recommendation</td>
              <td>0</td>
              <td>0</td>
              <td>–0.638</td>
            </tr>
            <tr valign="top">
              <td>Dokgam yuhaeng [flu epidemic]</td>
              <td>Social media; query recommendation</td>
              <td>0</td>
              <td>0</td>
              <td>–0.109</td>
            </tr>
            <tr valign="top">
              <td>Dokgam ipwon [flu hospitalization]</td>
              <td>Social media; query recommendation</td>
              <td>8.156</td>
              <td>0</td>
              <td>13.793</td>
            </tr>
            <tr valign="top">
              <td>Dokgam jeonyeom [flu infection]</td>
              <td>Social media; query recommendation</td>
              <td>38.184</td>
              <td>81.830</td>
              <td>9.762</td>
            </tr>
            <tr valign="top">
              <td>Dokgam jeonpa [flu dissemination]</td>
              <td>Social media; query recommendation</td>
              <td>2.596</td>
              <td>5.613</td>
              <td>3.973</td>
            </tr>
            <tr valign="top">
              <td>Dokgamjusa [flu injection]</td>
              <td>Social media; query recommendation</td>
              <td>–3.907</td>
              <td>0</td>
              <td>0</td>
            </tr>
            <tr valign="top">
              <td>Dokgamjuuibo [flu watch]</td>
              <td>Query recommendation</td>
              <td>0.883</td>
              <td>0.310</td>
              <td>0</td>
            </tr>
            <tr valign="top">
              <td>Dokgam hakgyo [flu school]</td>
              <td>Social media</td>
              <td>9.268</td>
              <td>0</td>
              <td>0</td>
            </tr>
            <tr valign="top">
              <td>Dokgam hapbyeongjeung [flu complication]</td>
              <td>Social media</td>
              <td>0</td>
              <td>0</td>
              <td>3.513</td>
            </tr>
            <tr valign="top">
              <td>Dokgamhwanja [flu patient]</td>
              <td>Social media</td>
              <td>7.024</td>
              <td>5.027</td>
              <td>3.205</td>
            </tr>
            <tr valign="top">
              <td>Dwaejidokgam [swine flu]</td>
              <td>Query recommendation</td>
              <td>0.358</td>
              <td>0</td>
              <td>0</td>
            </tr>
            <tr valign="top">
              <td>Maseukeu [mask]</td>
              <td>Social media</td>
              <td>8.053</td>
              <td>0</td>
              <td>0</td>
            </tr>
            <tr valign="top">
              <td>Momsal [body aches]</td>
              <td>Social media; chief concern</td>
              <td>0</td>
              <td>1.387</td>
              <td>3.912</td>
            </tr>
            <tr valign="top">
              <td>Soa dokgam jeungsang [child flu symptoms]</td>
              <td>Query recommendation</td>
              <td>4.737</td>
              <td>8.058</td>
              <td>9.041</td>
            </tr>
            <tr valign="top">
              <td>Adong dokgam jeungsang [child flu symptoms]</td>
              <td>Social media; query recommendation</td>
              <td>0</td>
              <td>0</td>
              <td>–5.273</td>
            </tr>
            <tr valign="top">
              <td>Eoreun dokgam jeungsang [adult flu symptoms]</td>
              <td>Query recommendation</td>
              <td>5.156</td>
              <td>1.485</td>
              <td>0.610</td>
            </tr>
            <tr valign="top">
              <td>Eolgultongjeung [face pain]</td>
              <td>Chief concern</td>
              <td>–1.057</td>
              <td>0</td>
              <td>0</td>
            </tr>
            <tr valign="top">
              <td>Onmomi apeum [whole body pain]</td>
              <td>Chief concern</td>
              <td>2.962</td>
              <td>3.725</td>
              <td>4.791</td>
            </tr>
            <tr valign="top">
              <td>Uisa [doctor]</td>
              <td>Social media</td>
              <td>–3.153</td>
              <td>–0.436</td>
              <td>–0.712</td>
            </tr>
            <tr valign="top">
              <td>Inpeulruenja ahyeong [influenza A type]</td>
              <td>Social media; query recommendation</td>
              <td>0</td>
              <td>8.349</td>
              <td>5.837</td>
            </tr>
            <tr valign="top">
              <td>Inpeulruenja samang [influenza death]</td>
              <td>Social media; query recommendation</td>
              <td>0</td>
              <td>–0.363</td>
              <td>–5.193</td>
            </tr>
            <tr valign="top">
              <td>Inpeulruenja yak [influenza medicine]</td>
              <td>Social media; query recommendation</td>
              <td>0</td>
              <td>0</td>
              <td>–0.560</td>
            </tr>
            <tr valign="top">
              <td>Inpeulruenja jeungse [influenza symptoms]</td>
              <td>Social media; query recommendation</td>
              <td>3.039</td>
              <td>2.051</td>
              <td>5.303</td>
            </tr>
            <tr valign="top">
              <td>Ipwon [hospitalization]</td>
              <td>Social media</td>
              <td>0</td>
              <td>0</td>
              <td>–0.213</td>
            </tr>
            <tr valign="top">
              <td>Joryudokgam [avian flu]</td>
              <td>Query recommendation</td>
              <td>3.972</td>
              <td>4.239</td>
              <td>3.492</td>
            </tr>
            <tr valign="top">
              <td>Tamipeulru [Tamiflu]</td>
              <td>Social media; query recommendation</td>
              <td>0</td>
              <td>65.618</td>
              <td>75.462</td>
            </tr>
            <tr valign="top">
              <td>Pyeryeom [pneumonia]</td>
              <td>Social media; query recommendation; chief concern</td>
              <td>0</td>
              <td>0</td>
              <td>–1.288</td>
            </tr>
            <tr valign="top">
              <td>Peulru [flu]</td>
              <td>Seed keyword</td>
              <td>15.992</td>
              <td>13.406</td>
              <td>5.924</td>
            </tr>
            <tr valign="top">
              <td>Hwanja [patient]</td>
              <td>Social media</td>
              <td>–4.543</td>
              <td>–3.170</td>
              <td>–2.922</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
      <fig id="figure1" position="float">
        <label>Figure 1</label>
        <caption>
          <p>Support vector machine for regression (SVR) prediction and error for influenza-like illness (ILI) surveillance in Korea. This figure shows the performance of the SVR model using the validation set of KCDC surveillance data to predict the next observation. Note: log error=log([obs–exp]<sup>2</sup>/abs[exp]).</p>
        </caption>
        <graphic xlink:href="jmir_v18i7e177_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <fig id="figure2" position="float">
        <label>Figure 2</label>
        <caption>
          <p>Support vector machine for regression (SVR) prediction and error for virological surveillance in Korea. This figure shows the performance of the SVR model using the validation set of KCDC surveillance data to predict the next observation. Note: log error=log([obs–exp]<sup>2</sup>/abs[exp]); VIR: virological positive rate.</p>
        </caption>
        <graphic xlink:href="jmir_v18i7e177_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <p>This study investigated whether search queries have the capacity to enhance the traditional influenza surveillance system in South Korea. To select queries most likely to be associated with influenza epidemics, we adopted an approach that explored contextual information available in social media data. A considerable proportion of optimal features for our final models were derived from queries with reference to the social media data. Our best model for South Korean ILI data included 36 queries and was highly correlated with observed ILI incidence rates. Our model for virological data, which included 53 queries generated through the same principles as the ILI model, performed equally well in terms of its correlation with observed virological incidence rates. Hence, our models for detecting national influenza incidence have the power to monitor changes. These results demonstrate the feasibility of search queries in enhancing influenza surveillance in South Korea.</p>
      <p>Created to predict the incidence of influenza throughout the year, including during high- and low-incidence seasons, our model performed as well as previous models that had benefited from full access to search logs to predict influenza incidence using search queries [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. Researchers who do not have full access to search logs need to choose the most pertinent queries, but these may be difficult to determine [<xref ref-type="bibr" rid="ref1">1</xref>]. Our current approach for query selection using social media data appears to be ideal for supporting influenza surveillance based on search query data. First, social media data may be helpful for obtaining information for query selection because they contain a greater variety of contextual health information, with diverse descriptions of health states. Above all, it may be a more efficient and unobtrusive way to gather health information. Second, an approach using social media data offers clues for understanding such predictors and their weight, which may vary over time. In generating a prediction model using search query data, it is important to note that search queries change over time. An individual’s search behavior changes constantly and keywords submitted by individuals may be influenced by numerous factors, such as media-driven interest or various events [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. These changes alter or degrade the performance of search query-based surveillance. The recent Google Flu Trends overestimation can also be understood in the same context [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. Constructing a model that is flexible over time is probably the most difficult, but also the most important, task to complete in the future creation of robust surveillance systems. The systematic exploration of changing predictors in social media data may help to update models based on search queries within a statistical learning framework.</p>
      <p>Internet usage is strongly associated with behaviors related to health information seeking and sharing. Some users write expositions about their health through various social media channels, such as blogs and Twitter, while some users leave query logs of health-related questions on the Internet search engines of websites. These types of activities may provide complementary information; it is likely that social media data contain diverse descriptions of personal experiences and information, whereas search engine query data specifically relate to queries, which are submitted for the sole purpose of obtaining information. Starting with studies that have exploited search trends, suggested first in 2006 [<xref ref-type="bibr" rid="ref25">25</xref>], the notion of detecting influenza activity using Internet-based data has been extended to experimentation with social media data [<xref ref-type="bibr" rid="ref25">25</xref>]. Thus far, several studies have tried to separately evaluate the scientific potential of each type of novel data for detecting emerging influenza incidence. Although previous empirical studies have reported some significant results, this domain of inquiry is still very much in its infancy [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>] and several limitations pertaining to data sources can be identified [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. Beyond simply conducting experiments to replicate the findings of previous studies using each type of novel data, perhaps it is time to consider a new strategy, one that adopts mutually reinforcing measures of the valuable information contained in each type of data.</p>
      <p>We have used query data obtained from Daum, a Korean local website. The market share of Daum is only 17.4% despite being the second-largest search engine in South Korea; nevertheless, our prediction exhibited strong congruence with national ILI incidence rates. Previous research using query data from Daum has found that some cumulative queries selected by means of survey were also strongly correlated with national influenza surveillance data in South Korea between September 6, 2009 and September 1, 2012 [<xref ref-type="bibr" rid="ref9">9</xref>]. The findings jointly suggest the possibility of developing an influenza surveillance system using a nondominant search engine.</p>
      <p>However, changes in Internet usage rates and health information seeking rates may constitute a somewhat central limitation on the use of search query data. Noise from irrelevant information and uncertainty regarding the representativeness of the sample of health information seekers are also significant limitations. These limitations exist in the data used in our study; thus, optimal features of our model may need to be updated over time.</p>
      <p>The initial days of an epidemic represent a critical period for health authorities in terms of initiating appropriate interventions. An online surveillance system allows for cost-effective and near real-time monitoring of infectious disease outbreaks through rapid data collection.</p>
      <p>Despite several limitations, this study provides further evidence, based on a new approach, for linkages between the use of Internet-based data and the surveillance of emerging influenza incidence in South Korea. We found that Internet-based influenza surveillance that combines search engine query data with social media data has the power to detect influenza outbreaks, exhibiting strong congruence with traditional surveillance data. Such an approach may provide valuable support in preparing for severe pandemics, such as the 2009 influenza A (H1N1) pandemic, and in controlling seasonal influenza epidemics. Furthermore, in an attempt to exploit the complementary nature of two types of data sources, in this study we fused information drawn from social media with a methodology for query-based influenza surveillance. Our results imply that these new data sources can be compatible and complementary in predicting influenza incidence. Our approach indicates that an online surveillance system can play a significant role in detecting infectious diseases such as influenza in near real time before the release of official reports in South Korea.</p>
    </sec>
  </body>
  <back>
    <app-group>
      <app id="app1">
        <title>Multimedia Appendix 1</title>
        <p>Queries related to influenza generated by an initial query selection approach.</p>
        <media xlink:href="jmir_v18i7e177_app1.pdf" xlink:title="PDF File (Adobe PDF File), 100KB"/>
      </app>
      <app id="app2">
        <title>Multimedia Appendix 2</title>
        <p>Support vector machine for regression (SVR) prediction and error for influenza-like illness (ILI) surveillance in Korea.</p>
        <media xlink:href="jmir_v18i7e177_app2.png" xlink:title="PNG File, 799KB"/>
      </app>
      <app id="app3">
        <title>Multimedia Appendix 3</title>
        <p>Support vector machine for regression (SVR) prediction and error for virological surveillance in Korea.</p>
        <media xlink:href="jmir_v18i7e177_app3.png" xlink:title="PNG File, 803KB"/>
      </app>
      <app id="app4">
        <title>Multimedia Appendix 4</title>
        <p>Search for the optimal final model.</p>
        <media xlink:href="jmir_v18i7e177_app4.pdf" xlink:title="PDF File (Adobe PDF File), 68KB"/>
      </app>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">API</term>
          <def>
            <p>application program interface</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">ILI</term>
          <def>
            <p>influenza-like illness</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">KCDC</term>
          <def>
            <p>Korea Centers for Disease Control and Prevention</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">Lasso</term>
          <def>
            <p>least absolute shrinkage and selection operator</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">RMSE</term>
          <def>
            <p>root mean square error</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">SVR</term>
          <def>
            <p>support vector machine for regression</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This study was supported by the Brain Fusion Research Grant from Seoul National University. The funder had no role in the study design, data collection and analysis, or preparation of the manuscript. The opinions, results, and conclusions reported in this paper are those of the authors and are independent of the funding source.</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>Conceived and designed the experiment: H Woo, Y Cho, E Shim; performed the experiment: Y Cho, H Woo, E Shim, J Lee, C Lee; analyzed the data: H Woo; contributed materials: S Kim; wrote the paper: H Woo, Y Cho.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Cho</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Sohn</surname>
            <given-names>CH</given-names>
          </name>
          <name name-style="western">
            <surname>Jo</surname>
            <given-names>MW</given-names>
          </name>
          <name name-style="western">
            <surname>Shin</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Lee</surname>
            <given-names>JH</given-names>
          </name>
          <name name-style="western">
            <surname>Ryoo</surname>
            <given-names>SM</given-names>
          </name>
          <name name-style="western">
            <surname>Kim</surname>
            <given-names>WY</given-names>
          </name>
          <name name-style="western">
            <surname>Seo</surname>
            <given-names>D</given-names>
          </name>
        </person-group>
        <article-title>Correlation between national influenza surveillance data and Google Trends in South Korea</article-title>
        <source>PLoS One</source>  
        <year>2013</year>  
        <volume>8</volume>  
        <issue>12</issue>  
        <fpage>e81422</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0081422"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1371/journal.pone.0081422</pub-id>
        <pub-id pub-id-type="medline">24339927</pub-id>
        <pub-id pub-id-type="pii">PONE-D-13-24884</pub-id>
        <pub-id pub-id-type="pmcid">PMC3855287</pub-id></nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Cook</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Conrad</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Fowlkes</surname>
            <given-names>AL</given-names>
          </name>
          <name name-style="western">
            <surname>Mohebbi</surname>
            <given-names>MH</given-names>
          </name>
        </person-group>
        <article-title>Assessing Google Flu Trends performance in the United States during the 2009 influenza virus A (H1N1) pandemic</article-title>
        <source>PLoS One</source>  
        <year>2011</year>  
        <month>8</month>  
        <volume>6</volume>  
        <issue>8</issue>  
        <fpage>e23610</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0023610"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1371/journal.pone.0023610</pub-id>
        <pub-id pub-id-type="medline">21886802</pub-id>
        <pub-id pub-id-type="pii">PONE-D-11-06712</pub-id>
        <pub-id pub-id-type="pmcid">PMC3158788</pub-id></nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Ginsberg</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Mohebbi</surname>
            <given-names>MH</given-names>
          </name>
          <name name-style="western">
            <surname>Patel</surname>
            <given-names>RS</given-names>
          </name>
          <name name-style="western">
            <surname>Brammer</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Smolinski</surname>
            <given-names>MS</given-names>
          </name>
          <name name-style="western">
            <surname>Brilliant</surname>
            <given-names>L</given-names>
          </name>
        </person-group>
        <article-title>Detecting influenza epidemics using search engine query data</article-title>
        <source>Nature</source>  
        <year>2009</year>  
        <month>02</month>  
        <day>19</day>  
        <volume>457</volume>  
        <issue>7232</issue>  
        <fpage>1012</fpage>  
        <lpage>1014</lpage>  
        <pub-id pub-id-type="doi">10.1038/nature07634</pub-id>
        <pub-id pub-id-type="medline">19020500</pub-id>
        <pub-id pub-id-type="pii">nature07634</pub-id></nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Polgreen</surname>
            <given-names>PM</given-names>
          </name>
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Pennock</surname>
            <given-names>DM</given-names>
          </name>
          <name name-style="western">
            <surname>Nelson</surname>
            <given-names>FD</given-names>
          </name>
        </person-group>
        <article-title>Using internet searches for influenza surveillance</article-title>
        <source>Clin Infect Dis</source>  
        <year>2008</year>  
        <month>12</month>  
        <day>1</day>  
        <volume>47</volume>  
        <issue>11</issue>  
        <fpage>1443</fpage>  
        <lpage>1448</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.cid.oxfordjournals.org/cgi/pmidlookup?view=long&#38;pmid=18954267"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1086/593098</pub-id>
        <pub-id pub-id-type="medline">18954267</pub-id></nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Yuan</surname>
            <given-names>Q</given-names>
          </name>
          <name name-style="western">
            <surname>Nsoesie</surname>
            <given-names>EO</given-names>
          </name>
          <name name-style="western">
            <surname>Lv</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Peng</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Chunara</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Brownstein</surname>
            <given-names>JS</given-names>
          </name>
        </person-group>
        <article-title>Monitoring influenza epidemics in China with search query from Baidu</article-title>
        <source>PLoS One</source>  
        <year>2013</year>  
        <month>5</month>  
        <volume>8</volume>  
        <issue>5</issue>  
        <fpage>e64323</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0064323"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1371/journal.pone.0064323</pub-id>
        <pub-id pub-id-type="medline">23750192</pub-id>
        <pub-id pub-id-type="pii">PONE-D-13-00331</pub-id>
        <pub-id pub-id-type="pmcid">PMC3667820</pub-id></nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Hulth</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Rydevik</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Linde</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>Web queries as a source for syndromic surveillance</article-title>
        <source>PLoS One</source>  
        <year>2009</year>  
        <volume>4</volume>  
        <issue>2</issue>  
        <fpage>e4378</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0004378"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1371/journal.pone.0004378</pub-id>
        <pub-id pub-id-type="medline">19197389</pub-id>
        <pub-id pub-id-type="pmcid">PMC2634970</pub-id></nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Lazer</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Kennedy</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>King</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Vespignani</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>Big data. The parable of Google Flu: traps in big data analysis</article-title>
        <source>Science</source>  
        <year>2014</year>  
        <month>03</month>  
        <day>14</day>  
        <volume>343</volume>  
        <issue>6176</issue>  
        <fpage>1203</fpage>  
        <lpage>1205</lpage>  
        <pub-id pub-id-type="doi">10.1126/science.1248506</pub-id>
        <pub-id pub-id-type="medline">24626916</pub-id>
        <pub-id pub-id-type="pii">343/6176/1203</pub-id></nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Lazer</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Kennedy</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>King</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Vespignani</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>Twitter: big data opportunities--response</article-title>
        <source>Science</source>  
        <year>2014</year>  
        <volume>345</volume>  
        <issue>6193</issue>  
        <fpage>148</fpage>  
        <lpage>149</lpage>  
        <pub-id pub-id-type="medline">25013053</pub-id></nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Seo</surname>
            <given-names>DW</given-names>
          </name>
          <name name-style="western">
            <surname>Jo</surname>
            <given-names>MW</given-names>
          </name>
          <name name-style="western">
            <surname>Sohn</surname>
            <given-names>CH</given-names>
          </name>
          <name name-style="western">
            <surname>Shin</surname>
            <given-names>SY</given-names>
          </name>
          <name name-style="western">
            <surname>Lee</surname>
            <given-names>JH</given-names>
          </name>
          <name name-style="western">
            <surname>Yu</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Kim</surname>
            <given-names>WY</given-names>
          </name>
          <name name-style="western">
            <surname>Lim</surname>
            <given-names>KS</given-names>
          </name>
          <name name-style="western">
            <surname>Lee</surname>
            <given-names>SI</given-names>
          </name>
        </person-group>
        <article-title>Cumulative query method for influenza surveillance using search engine data</article-title>
        <source>J Med Internet Res</source>  
        <year>2014</year>  
        <month>12</month>  
        <volume>16</volume>  
        <issue>12</issue>  
        <fpage>e289</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2014/12/e289/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/jmir.3680</pub-id>
        <pub-id pub-id-type="medline">25517353</pub-id>
        <pub-id pub-id-type="pii">v16i12e289</pub-id>
        <pub-id pub-id-type="pmcid">PMC4275481</pub-id></nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Kang</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Zhong</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>He</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Rutherford</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Yang</surname>
            <given-names>F</given-names>
          </name>
        </person-group>
        <article-title>Using Google Trends for influenza surveillance in South China</article-title>
        <source>PLoS One</source>  
        <year>2013</year>  
        <month>1</month>  
        <volume>8</volume>  
        <issue>1</issue>  
        <fpage>e55205</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0055205"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1371/journal.pone.0055205</pub-id>
        <pub-id pub-id-type="medline">23372837</pub-id>
        <pub-id pub-id-type="pii">PONE-D-12-26520</pub-id>
        <pub-id pub-id-type="pmcid">PMC3555864</pub-id></nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="web">
        <source>The World in 2015: ICT Facts and Figures</source>  
        <year>2014</year>  
        <access-date>2015-07-09</access-date>
        <publisher-loc>Geneva, Switzerland</publisher-loc>
        <publisher-name>International Telecommunication Union</publisher-name>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.itu.int/en/ITU-D/Statistics/Documents/facts/ICTFactsFigures2015.pdf">http://www.itu.int/en/ITU-D/Statistics/Documents/facts/ICTFactsFigures2015.pdf</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6Zt1Nb6VZ"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="web">
        <source>Infectious Disease Statistics System, Korea Centers for Disease Control and Prevention</source>  
        <access-date>2015-07-09</access-date>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://is.cdc.go.kr/dstat/index.jsp">http://is.cdc.go.kr/dstat/index.jsp</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6Zt0Q7gKi"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="web">
        <source>NAVER</source>  
        <access-date>2015-07-09</access-date>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://section.blog.naver.com">http://section.blog.naver.com</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6Zt0XhwPJ"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="web">
        <source>Daum</source>  
        <access-date>2015-07-09</access-date>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.daum.net/">http://www.daum.net/</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6Zt0cdyl1"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="web">
        <source>Internet Trend</source>  
        <year>2015</year>  
        <access-date>2014-07-06</access-date>
        <comment>Market share of search engine in South Korea 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.internettrend.co.kr/trendForward.tsp">http://www.internettrend.co.kr/trendForward.tsp</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6QrnPuRwJ"/></comment> </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>He</surname>
            <given-names>Q</given-names>
          </name>
          <name name-style="western">
            <surname>Jiang</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Liao</surname>
            <given-names>Z</given-names>
          </name>
        </person-group>
        <article-title>Web query recommendation via sequential query prediction</article-title>
        <year>2009</year>  
        <month>03</month>  
        <day>29</day>  
        <conf-name>IEEE 25th International Conference on Data Engineering</conf-name>
        <conf-date>Mar 29-Apr 2, 2009</conf-date>
        <conf-loc>Shanghai</conf-loc>
        <fpage>1443</fpage>  
        <lpage>1454</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4812545"/>
          <ext-link ext-link-type="webcite" xlink:href="6iPJPmh3V"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Baeza-Yates</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Hurtado</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Mendoza</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Query recommendation using query logs in search engines</article-title>
        <source>Current Trends in Database Technology - EDBT 2004 Workshops</source>  
        <year>2004</year>  
        <month>03</month>  
        <day>14</day>  
        <publisher-loc>Berlin</publisher-loc>
        <publisher-name>Springer Berlin Heidelberg</publisher-name>
        <fpage>588</fpage>  
        <lpage>596</lpage> </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="web">
        <source>Centers for Disease Control and Prevention</source>  
        <year>2015</year>  
        <access-date>2015-07-09</access-date>
        <comment>Flu symptoms and complications 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.cdc.gov/FLU/ABOUT/disease/symptoms.htm">http://www.cdc.gov/FLU/ABOUT/disease/symptoms.htm</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6Zt0j4vyV"/></comment> </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Guyon</surname>
            <given-names>I</given-names>
          </name>
          <name name-style="western">
            <surname>Elisseeff</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>An introduction to variable and feature selection</article-title>
        <source>J Mach Learn Res</source>  
        <year>2003</year>  
        <month>03</month>  
        <day>01</day>  
        <volume>3</volume>  
        <fpage>1157</fpage>  
        <lpage>1182</lpage> </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Saeys</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Inza</surname>
            <given-names>I</given-names>
          </name>
          <name name-style="western">
            <surname>Larrañaga</surname>
            <given-names>P</given-names>
          </name>
        </person-group>
        <article-title>A review of feature selection techniques in bioinformatics</article-title>
        <source>Bioinformatics</source>  
        <year>2007</year>  
        <month>10</month>  
        <day>1</day>  
        <volume>23</volume>  
        <issue>19</issue>  
        <fpage>2507</fpage>  
        <lpage>2517</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://bioinformatics.oxfordjournals.org/cgi/pmidlookup?view=long&#38;pmid=17720704"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1093/bioinformatics/btm344</pub-id>
        <pub-id pub-id-type="medline">17720704</pub-id>
        <pub-id pub-id-type="pii">btm344</pub-id></nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Yang</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Xing</surname>
            <given-names>E</given-names>
          </name>
        </person-group>
        <article-title>From lasso regression to feature vector machine</article-title>
        <year>2005</year>  
        <conf-name>Advances in Neural Information Processing Systems 18</conf-name>
        <conf-date>Dec 5-8, 2005</conf-date>
        <conf-loc>Vancouver, BC</conf-loc>
        <fpage>411</fpage>  
        <lpage>418</lpage> </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Smola</surname>
            <given-names>AJ</given-names>
          </name>
          <name name-style="western">
            <surname>Schölkopf</surname>
            <given-names>B</given-names>
          </name>
        </person-group>
        <article-title>A tutorial on support vector regression</article-title>
        <source>Stat Comput</source>  
        <year>2004</year>  
        <month>08</month>  
        <volume>14</volume>  
        <issue>3</issue>  
        <fpage>199</fpage>  
        <lpage>222</lpage>  
        <pub-id pub-id-type="doi">10.1023/B:STCO.0000035301.49549.88</pub-id></nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Milinovich</surname>
            <given-names>GJ</given-names>
          </name>
          <name name-style="western">
            <surname>Williams</surname>
            <given-names>GM</given-names>
          </name>
          <name name-style="western">
            <surname>Clements</surname>
            <given-names>AC</given-names>
          </name>
          <name name-style="western">
            <surname>Hu</surname>
            <given-names>W</given-names>
          </name>
        </person-group>
        <article-title>Internet-based surveillance systems for monitoring emerging infectious diseases</article-title>
        <source>Lancet Infect Dis</source>  
        <year>2014</year>  
        <month>02</month>  
        <volume>14</volume>  
        <issue>2</issue>  
        <fpage>160</fpage>  
        <lpage>168</lpage>  
        <pub-id pub-id-type="doi">10.1016/S1473-3099(13)70244-5</pub-id>
        <pub-id pub-id-type="medline">24290841</pub-id>
        <pub-id pub-id-type="pii">S1473-3099(13)70244-5</pub-id></nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Althouse</surname>
            <given-names>BM</given-names>
          </name>
          <name name-style="western">
            <surname>Ng</surname>
            <given-names>YY</given-names>
          </name>
          <name name-style="western">
            <surname>Cummings</surname>
            <given-names>DA</given-names>
          </name>
        </person-group>
        <article-title>Prediction of dengue incidence using search query surveillance</article-title>
        <source>PLoS Negl Trop Dis</source>  
        <year>2011</year>  
        <month>08</month>  
        <volume>5</volume>  
        <issue>8</issue>  
        <fpage>e1258</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pntd.0001258"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1371/journal.pntd.0001258</pub-id>
        <pub-id pub-id-type="medline">21829744</pub-id>
        <pub-id pub-id-type="pii">PNTD-D-11-00369</pub-id>
        <pub-id pub-id-type="pmcid">PMC3149016</pub-id></nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Bernardo</surname>
            <given-names>TM</given-names>
          </name>
          <name name-style="western">
            <surname>Rajic</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Young</surname>
            <given-names>I</given-names>
          </name>
          <name name-style="western">
            <surname>Robiadek</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Pham</surname>
            <given-names>MT</given-names>
          </name>
          <name name-style="western">
            <surname>Funk</surname>
            <given-names>JA</given-names>
          </name>
        </person-group>
        <article-title>Scoping review on search queries and social media for disease surveillance: a chronology of innovation</article-title>
        <source>J Med Internet Res</source>  
        <year>2013</year>  
        <volume>15</volume>  
        <issue>7</issue>  
        <fpage>e147</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2013/7/e147/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/jmir.2740</pub-id>
        <pub-id pub-id-type="medline">23896182</pub-id>
        <pub-id pub-id-type="pii">v15i7e147</pub-id>
        <pub-id pub-id-type="pmcid">PMC3785982</pub-id></nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
