<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="letter" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v23i2e10969</article-id>
      <article-id pub-id-type="pmid">33570496</article-id>
      <article-id pub-id-type="doi">10.2196/10969</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Letter to the Editor</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Letter to the Editor</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Data Leakage in Health Outcomes Prediction With Machine Learning. Comment on “Prediction of Incident Hypertension Within the Next Year: Prospective Study Using Statewide Electronic Health Records and Machine Learning”</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Derrick</surname>
            <given-names>Thomas</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Goris</surname>
            <given-names>Johan</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Chiavegatto Filho</surname>
            <given-names>Alexandre</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Epidemiology</institution>
            <institution>School of Public Health</institution>
            <institution>University of São Paulo</institution>
            <addr-line>Av Dr Arnaldo 715</addr-line>
            <addr-line>São Paulo</addr-line>
            <country>Brazil</country>
            <phone>55 955543047</phone>
            <email>alexdiasporto@usp.br</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3251-9600</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Batista</surname>
            <given-names>André Filipe De Moraes</given-names>
          </name>
          <degrees>MSc, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4627-0244</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>dos Santos</surname>
            <given-names>Hellen Geremias</given-names>
          </name>
          <degrees>MPH, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6446-8660</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Epidemiology</institution>
        <institution>School of Public Health</institution>
        <institution>University of São Paulo</institution>
        <addr-line>São Paulo</addr-line>
        <country>Brazil</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Alexandre Chiavegatto Filho <email>alexdiasporto@usp.br</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>2</month>
        <year>2021</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>11</day>
        <month>2</month>
        <year>2021</year>
      </pub-date>
      <volume>23</volume>
      <issue>2</issue>
      <elocation-id>e10969</elocation-id>
      <history>
        <date date-type="received">
          <day>4</day>
          <month>5</month>
          <year>2018</year>
        </date>
        <date date-type="rev-request">
          <day>25</day>
          <month>10</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>8</day>
          <month>11</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>27</day>
          <month>1</month>
          <year>2021</year>
        </date>
      </history>
      <copyright-statement>©Alexandre Chiavegatto Filho, André Filipe De Moraes Batista, Hellen Geremias dos Santos. Originally published in the Journal of Medical Internet Research (http://www.jmir.org), 11.02.2021.</copyright-statement>
      <copyright-year>2021</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on http://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2021/2/e10969" xlink:type="simple"/>
      <related-article related-article-type="commentary-article" id="v20i1e22" ext-link-type="doi" xlink:href="10.2196/jmir.9268" vol="20" page="e22" xlink:type="simple">https://www.jmir.org/2018/1/e22/</related-article>
      <kwd-group>
        <kwd>machine learning</kwd>
        <kwd>data leakage</kwd>
        <kwd>prediction</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <p>Applications of machine learning algorithms to predict the incidence of health outcomes have an enormous potential to improve clinical practice and lower health care costs [<xref ref-type="bibr" rid="ref1">1</xref>]. Machine learning is a subset of artificial intelligence that uses data to improve decisions through experience, which is especially promising in a data-driven world. Dr Ye and colleagues’ article on hypertension incidence prediction in the <italic>Journal of Medical Internet Research</italic> adds to this literature [<xref ref-type="bibr" rid="ref2">2</xref>], but its potential contribution and applicability are hindered by a major flaw.</p>
    <p>The objective of the study was to “develop and validate prospectively a risk prediction model of incident essential hypertension within the following year.” The authors follow good prediction protocols by applying a high-performing machine learning algorithm (XGBoost) and by validating the results on unseen data from the following year. The algorithm attained a very high area under the curve (AUC) value of 0.870 for incidence prediction of hypertension in the following year.</p>
    <p>The authors follow this impressive result by commenting on some of the most important predictive variables, such as demographic features, diagnosed chronic diseases, and mental illness. The ranking of the variables that were most important for predicting hypertension is included in a multimedia appendix; however, the above-mentioned variables are not listed near the top. Of the six most important variables, five were lisinopril, hydrochlorothiazide, enalapril maleate, amlodipine besylate, and losartan potassium. All of these are popular antihypertensive drugs.</p>
    <p>Data leakage occurs when one or more of the features used to train the algorithm have the result of the outcome hidden within them, and it is considered one of the most frequent mistakes in machine learning [<xref ref-type="bibr" rid="ref3">3</xref>]. This is different from predictive importance, that is, the relative effect of each variable in increasing or decreasing the expected outcome, as the leaked feature usually comes after the outcome. Therefore, it is a consequence of the outcome that is being predicted and not the other way around.</p>
    <p>A classic example from machine learning textbooks is the inclusion of the ID number of the patient as a predictor. While this should not have predictive importance if randomly assigned, it is common that patients coming from the same hospital have similar ID numbers in multicenter data sets. In the case of cancer prediction, for example, machine learning algorithms will learn that similar ID numbers that come from oncology hospitals have a higher probability of cancer.</p>
    <p>As an example, we used real data to test the effect of including mechanical ventilation to predict intensive care unit (ICU) admission among patients with COVID-19 [<xref ref-type="bibr" rid="ref4">4</xref>]. This is another example of data leakage, as mechanical ventilation usually only occurs after ICU admission and should not be used to predict its risk. <xref rid="figure1" ref-type="fig">Figure 1</xref> shows the decrease in the prediction metrics for ICU admission with the exclusion of mechanical ventilation as a predictor, with the area under the ROC (receiver operating characteristic) curve decreasing from 0.76 to 0.64, and precision from 0.49 to 0.17.</p>
    <fig id="figure1" position="float">
      <label>Figure 1</label>
      <caption>
        <p>Performance metrics for the prediction of intensive care unit (ICU) admission with and without the use of mechanical ventilation as a predictor.</p>
      </caption>
      <graphic xlink:href="jmir_v23i2e10969_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
    </fig>
    <p>By including the use of antihypertensive drugs as predictors for hypertension incidence in the following year, Dr Ye and colleagues’ work opens the possibility that the machine learning algorithm will focus on predicting those who already had hypertension but did not have this information in their medical record at baseline. While this would work for a prediction competition, where data science teams compete to produce the best predictive model such as in a Kaggle challenge [<xref ref-type="bibr" rid="ref5">5</xref>], it is not of particular scientific or clinical interest. For clinical purposes, just one variable (the use of a hypertension drug) is sufficient for physicians to infer the presence of hypertension, while for scientific purposes, the knowledge of this being a highly predictable event (as measured by the AUC) is severely impaired.</p>
    <p>In order to identify the presence of data leakage in prediction studies, it is important to have a conceptual pathway of how the predictors longitudinally affect the outcome variable, as there is no statistical method that is capable of pointing out the presence of data leakage. Improving the predictive performance of specific data sets for different diseases is an important new field in epidemiology and data science. The authors can still contribute to this literature by providing the new AUC of the prediction after addressing the data leakage issue.</p>
  </body>
  <back>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AUC</term>
          <def>
            <p>area under the curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">ICU</term>
          <def>
            <p>intensive care unit</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">ROC</term>
          <def>
            <p>receiver operating characteristic</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <fn-group>
      <fn fn-type="other">
        <p><bold>Editorial Notice</bold></p>
        <p>The corresponding author of “Prediction of Incident Hypertension Within the Next Year: Prospective Study Using Statewide Electronic Health Records and Machine Learning” did not respond to our invitation to reply to this commentary.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <app-group/>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Obermeyer</surname>
              <given-names>Ziad</given-names>
            </name>
            <name name-style="western">
              <surname>Emanuel</surname>
              <given-names>Ezekiel J</given-names>
            </name>
          </person-group>
          <article-title>Predicting the Future - Big Data, Machine Learning, and Clinical Medicine</article-title>
          <source>N Engl J Med</source>
          <year>2016</year>
          <month>09</month>
          <day>29</day>
          <volume>375</volume>
          <issue>13</issue>
          <fpage>1216</fpage>
          <lpage>1219</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/27682033"/>
          </comment>
          <pub-id pub-id-type="doi">10.1056/NEJMp1606181</pub-id>
          <pub-id pub-id-type="medline">27682033</pub-id>
          <pub-id pub-id-type="pmcid">PMC5070532</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>Chengyin</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>Tianyun</given-names>
            </name>
            <name name-style="western">
              <surname>Hao</surname>
              <given-names>Shiying</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Yan</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Oliver</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>Bo</given-names>
            </name>
            <name name-style="western">
              <surname>Xia</surname>
              <given-names>Minjie</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Modi</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Xin</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Qian</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>Yanting</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>Chunqing</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Yu-Ming</given-names>
            </name>
            <name name-style="western">
              <surname>Culver</surname>
              <given-names>Devore S</given-names>
            </name>
            <name name-style="western">
              <surname>Alfreds</surname>
              <given-names>Shaun T</given-names>
            </name>
            <name name-style="western">
              <surname>Stearns</surname>
              <given-names>Frank</given-names>
            </name>
            <name name-style="western">
              <surname>Sylvester</surname>
              <given-names>Karl G</given-names>
            </name>
            <name name-style="western">
              <surname>Widen</surname>
              <given-names>Eric</given-names>
            </name>
            <name name-style="western">
              <surname>McElhinney</surname>
              <given-names>Doff</given-names>
            </name>
            <name name-style="western">
              <surname>Ling</surname>
              <given-names>Xuefeng</given-names>
            </name>
          </person-group>
          <article-title>Prediction of Incident Hypertension Within the Next Year: Prospective Study Using Statewide Electronic Health Records and Machine Learning</article-title>
          <source>J Med Internet Res</source>
          <year>2018</year>
          <month>01</month>
          <day>30</day>
          <volume>20</volume>
          <issue>1</issue>
          <fpage>e22</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2018/1/e22/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/jmir.9268</pub-id>
          <pub-id pub-id-type="medline">29382633</pub-id>
          <pub-id pub-id-type="pii">v20i1e22</pub-id>
          <pub-id pub-id-type="pmcid">PMC5811646</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kaufman</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rosset</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Perlich</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Leakage in Data Mining: Formulation, Detection, and Avoidance</article-title>
          <year>2011</year>
          <conf-name>Proceedings of the 17th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name>
          <conf-date>August 21-24, 2011</conf-date>
          <conf-loc>San Diego, CA, USA</conf-loc>
          <fpage>556</fpage>
          <lpage>563</lpage>
          <pub-id pub-id-type="doi">10.1145/2020408.2020496</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="web">
          <article-title>Database: Covid-mexico (predictive models)</article-title>
          <source>GitHub</source>
          <year>2020</year>
          <access-date>2020-11-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/salomonw/covid-mexico">https://github.com/salomonw/covid-mexico</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nisbet</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Elder</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Miner</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <source>Handbook of Statistical Analysis and Data Mining Applications</source>
          <year>2009</year>
          <publisher-loc>Burlington, MA</publisher-loc>
          <publisher-name>Elsevier</publisher-name>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
