<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v24i6e34295</article-id>
      <article-id pub-id-type="pmid">35502887</article-id>
      <article-id pub-id-type="doi">10.2196/34295</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Machine Learning–Based Prediction Models for Different Clinical Risks in Different Hospitals: Evaluation of Live Performance</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Leung</surname>
            <given-names>Tiffany</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Domínguez-Olmedo</surname>
            <given-names>Juan L</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Bajpai</surname>
            <given-names>Ram</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zhang</surname>
            <given-names>Xiangzhou</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes" equal-contrib="yes">
          <name name-style="western">
            <surname>Sun</surname>
            <given-names>Hong</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Dedalus Healthcare</institution>
            <addr-line>Roderveldlaan 2</addr-line>
            <addr-line>Antwerp, 2600</addr-line>
            <country>Belgium</country>
            <phone>32 3444 8108</phone>
            <email>hong.sun@dedalus.com</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7112-5420</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Depraetere</surname>
            <given-names>Kristof</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3859-3791</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Meesseman</surname>
            <given-names>Laurent</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3761-9822</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Cabanillas Silva</surname>
            <given-names>Patricia</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7509-721X</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Szymanowsky</surname>
            <given-names>Ralph</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3765-5794</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Fliegenschmidt</surname>
            <given-names>Janis</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7396-5677</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Hulde</surname>
            <given-names>Nikolai</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5070-0249</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>von Dossow</surname>
            <given-names>Vera</given-names>
          </name>
          <degrees>MD, PhD, Prof Dr</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2540-7080</ext-link>
        </contrib>
        <contrib id="contrib9" contrib-type="author">
          <name name-style="western">
            <surname>Vanbiervliet</surname>
            <given-names>Martijn</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6951-4585</ext-link>
        </contrib>
        <contrib id="contrib10" contrib-type="author">
          <name name-style="western">
            <surname>De Baerdemaeker</surname>
            <given-names>Jos</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4829-1700</ext-link>
        </contrib>
        <contrib id="contrib11" contrib-type="author">
          <name name-style="western">
            <surname>Roccaro-Waldmeyer</surname>
            <given-names>Diana M</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4401-1567</ext-link>
        </contrib>
        <contrib id="contrib12" contrib-type="author">
          <name name-style="western">
            <surname>Stieg</surname>
            <given-names>Jörg</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2128-4017</ext-link>
        </contrib>
        <contrib id="contrib13" contrib-type="author">
          <name name-style="western">
            <surname>Domínguez Hidalgo</surname>
            <given-names>Manuel</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5356-6735</ext-link>
        </contrib>
        <contrib id="contrib14" contrib-type="author">
          <name name-style="western">
            <surname>Dahlweid</surname>
            <given-names>Fried-Michael</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5416-9915</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Dedalus Healthcare</institution>
        <addr-line>Antwerp</addr-line>
        <country>Belgium</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Institute of Anesthesiology and Pain Therapy</institution>
        <institution>Heart and Diabetes Centre North Rhine-Westphalia</institution>
        <institution>University Hospital of Ruhr-University Bochum</institution>
        <addr-line>Bad Oeynhausen</addr-line>
        <country>Germany</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Hong Sun <email>hong.sun@dedalus.com</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>6</month>
        <year>2022</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>7</day>
        <month>6</month>
        <year>2022</year>
      </pub-date>
      <volume>24</volume>
      <issue>6</issue>
      <elocation-id>e34295</elocation-id>
      <history>
        <date date-type="received">
          <day>19</day>
          <month>10</month>
          <year>2021</year>
        </date>
        <date date-type="rev-request">
          <day>7</day>
          <month>1</month>
          <year>2022</year>
        </date>
        <date date-type="rev-recd">
          <day>25</day>
          <month>2</month>
          <year>2022</year>
        </date>
        <date date-type="accepted">
          <day>12</day>
          <month>4</month>
          <year>2022</year>
        </date>
      </history>
      <copyright-statement>©Hong Sun, Kristof Depraetere, Laurent Meesseman, Patricia Cabanillas Silva, Ralph Szymanowsky, Janis Fliegenschmidt, Nikolai Hulde, Vera von Dossow, Martijn Vanbiervliet, Jos De Baerdemaeker, Diana M Roccaro-Waldmeyer, Jörg Stieg, Manuel Domínguez Hidalgo, Fried-Michael Dahlweid. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 07.06.2022.</copyright-statement>
      <copyright-year>2022</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2022/6/e34295" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Machine learning algorithms are currently used in a wide array of clinical domains to produce models that can predict clinical risk events. Most models are developed and evaluated with retrospective data, very few are evaluated in a clinical workflow, and even fewer report performances in different hospitals. In this study, we provide detailed evaluations of clinical risk prediction models in live clinical workflows for three different use cases in three different hospitals.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>The main objective of this study was to evaluate clinical risk prediction models in live clinical workflows and compare their performance in these settings with their performance when using retrospective data. We also aimed at generalizing the results by applying our investigation to three different use cases in three different hospitals.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We trained clinical risk prediction models for three use cases (ie, delirium, sepsis, and acute kidney injury) in three different hospitals with retrospective data. We used machine learning and, specifically, deep learning to train models that were based on the Transformer model. The models were trained using a calibration tool that is common for all hospitals and use cases. The models had a common design but were calibrated using each hospital’s specific data. The models were deployed in these three hospitals and used in daily clinical practice. The predictions made by these models were logged and correlated with the diagnosis at discharge. We compared their performance with evaluations on retrospective data and conducted cross-hospital evaluations.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The performance of the prediction models with data from live clinical workflows was similar to the performance with retrospective data. The average value of the area under the receiver operating characteristic curve (AUROC) decreased slightly by 0.6 percentage points (from 94.8% to 94.2% at discharge). The cross-hospital evaluations exhibited severely reduced performance: the average AUROC decreased by 8 percentage points (from 94.2% to 86.3% at discharge), which indicates the importance of model calibration with data from the deployment hospital.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Calibrating the prediction model with data from different deployment hospitals led to good performance in live settings. The performance degradation in the cross-hospital evaluation identified limitations in developing a generic model for different hospitals. Designing a generic process for model development to generate specialized prediction models for each hospital guarantees model performance in different hospitals.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>machine learning</kwd>
        <kwd>clinical risk prediction</kwd>
        <kwd>prediction</kwd>
        <kwd>model</kwd>
        <kwd>model evaluation</kwd>
        <kwd>scalability</kwd>
        <kwd>risk</kwd>
        <kwd>live clinical workflow</kwd>
        <kwd>delirium</kwd>
        <kwd>sepsis</kwd>
        <kwd>acute kidney injury</kwd>
        <kwd>kidney</kwd>
        <kwd>EHR</kwd>
        <kwd>electronic health record</kwd>
        <kwd>workflow</kwd>
        <kwd>algorithm</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Machine learning algorithms for clinical risk predictions are widely used in health care research and applications [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref5">5</xref>]. While much work has been done on developing distinct clinical risk prediction models, the scalability of the prediction models has been much less explored (ie, the extensibility of the risk prediction model for multiple diseases over different hospitals) [<xref ref-type="bibr" rid="ref6">6</xref>].</p>
      <p>Rajkomar et al [<xref ref-type="bibr" rid="ref6">6</xref>] designed a single data structure based on the FHIR (Fast Healthcare Interoperability Resources) standard [<xref ref-type="bibr" rid="ref7">7</xref>] and developed different clinical scenarios over two hospitals with this common data structure. That was the first study that reported the performance of prediction models of multiple use cases in different hospitals. Churpek et al [<xref ref-type="bibr" rid="ref8">8</xref>] aggregated electronic health record (EHR) data from five hospitals to train a single model to make predictions on cardiac arrest, intensive care unit (ICU) transfers, or death on wards. The model outperformed the existing Modified Early Warning Score. The limitation is that both studies [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref8">8</xref>] were validated with retrospective data and have not yet been used in a live clinical workflow.</p>
      <p>In our previous publication [<xref ref-type="bibr" rid="ref9">9</xref>], we discussed the scalability issue in clinical risk prediction model development; we also presented a scalable approach for prediction model development that is applied to delirium, sepsis, and acute kidney injury (AKI) covering four different hospitals. However, these prediction models were only evaluated on retrospective data.</p>
      <p>Evaluating the prediction models in live clinical settings is crucial because factors such as interoperability across different platforms or different prevalence can affect the performance of an artificial intelligence (AI) algorithm [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. However, very few prediction models have been evaluated in a live clinical workflow. For example, several delirium prediction models that have been reported in recent years [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>] have all been evaluated on retrospective data. Jauk et al [<xref ref-type="bibr" rid="ref14">14</xref>] claimed their findings to be the only delirium prediction model that has been evaluated in a live clinical workflow. In their study, 5530 predictions were analyzed, and 119 predictions were compared with ratings of clinical experts during a period of 7 months. The limitation of Jauk et al [<xref ref-type="bibr" rid="ref14">14</xref>] is that their model was only evaluated in a single hospital. When a prediction model is evaluated in different hospitals, the performance may degrade due to the difference in EHRs and workflows between the training data and the target hospital. Wong et al [<xref ref-type="bibr" rid="ref15">15</xref>] reported large performance degradation on sepsis prediction when a sepsis prediction model was applied in a different hospital.</p>
      <p>Wu et al [<xref ref-type="bibr" rid="ref16">16</xref>] considered it important to evaluate AI-based medical devices over different sites with live clinical settings to address the shortcomings, such as overfitting to training data and bias against underrepresented subgroups, among others. They investigated 130 US Food and Drug Administration–approved AI devices: 126 evaluations were performed as retrospective studies and 93 devices did not have multiple site evaluations.</p>
      <p>In this paper, we evaluated clinical risk prediction models (ie, delirium, sepsis, and AKI) in live clinical workflows in three different hospitals in Germany. We compared the performance of the models with their performance on retrospective data from our previous work. By logging prediction requests in the production EHR system, we ran cross-hospital evaluations mimicking the performance of a prediction model in live clinical workflows of different target hospitals. Domain experts executed preliminary evaluations on clinical soundness and usefulness of the predictions by following the use of the prediction service in their daily practice.</p>
      <p>To the best of our knowledge, we are the first to report the evaluation of machine learning–based clinical risk prediction models in the settings of production EHR systems, which focuses on evaluating several diseases in different hospitals at the same time. In addition, in the cross-hospital evaluation, we simulated the performance of a prediction model in live clinical workflows of different target hospitals.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p>We used a scalable approach, implemented in a calibration tool, to generate clinical risk prediction models for different use cases in three different German hospitals based on retrospective EHR data: Marienhospital Stuttgart (from 2004 to 2020), Herz- und Diabeteszentrum Nordrhein-Westfalen Bad Oeynhausen (from 2009 to 2020), and Medius Klinik Nürtingen (from 2009 to 2020). The evaluation in live systems was performed in the first half of 2021; details of the evaluation period are provided in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The characteristics of the training set are provided in Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, and the characteristics of the evaluation samples, in live systems, are provided in Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. We refer to these three hospitals as hospital M, hospital H, and hospital N, respectively. The calibration process that generates prediction models is described in our previous work [<xref ref-type="bibr" rid="ref9">9</xref>]. Using the calibration tool, models were trained independently on data from each hospital and deployed in the prediction service of the same hospital. Requests for predictions were generated from the EHR system in the FHIR [<xref ref-type="bibr" rid="ref7">7</xref>] “RiskAssessment” format and were sent to the prediction service. The prediction service parsed each prediction request into an observation, which was used to generate a prediction. The predictions were returned and displayed in the EHR system. The observation, together with the corresponding risk score produced by the prediction model, was stored for further evaluation of model performance.</p>
      </sec>
      <sec>
        <title>Model Development and Deployment</title>
        <p><xref rid="figure1" ref-type="fig">Figure 1</xref> shows the process of model development and evaluation with retrospective data. The process to design the prediction model, prepare data, and train models was defined following experiments performed on a development data set. The resulting dedicated process and prediction model design was implemented in an automated pipeline, named the calibration tool. The calibration tool provided a user-friendly approach to install, configure, and run the process of data preparation, model training, and evaluation on a customer-specific system. A command-line interface enabled service engineers to install the required software, files, and pretrained natural language processing (NLP) models and to execute the training and evaluation of the hospital-specific prediction models.</p>
        <p><xref rid="figure2" ref-type="fig">Figure 2</xref> shows the components and interactions of the calibration tool. The lower pane defines a fixed sequence of tasks to perform in order to calibrate models for the supported use cases with data from a target hospital. The upper pane contains a set of components that execute these tasks.</p>
        <p>We then ran the calibration tool independently in each hospital to generate clinical risk prediction models for each hospital. The models were trained based on the retrospective data that were generated as part of the clinical workflow of each target hospital. We thereby ensured that the model fit the clinical practice of the hospital where the model was to be deployed. The data checking process guaranteed that the source data were represented in the expected format. The data preparation process prepared the training and testing data. The labels of each use case were assigned by the labeler component based on the diagnosis codes assigned to each hospitalized patient at discharge. A common set of features was prepared and used by the different use cases, which included structured data, such as lab results and history of diagnosis, as well as clinical entities extracted from free-text clinical notes. Both a text search and a BERT (bidirectional encoder representations from transformers) [<xref ref-type="bibr" rid="ref17">17</xref>] named entity recognition model were used in preparing the NLP features. The following inclusion criteria were applied during data preparation: age and gender had to be known, patients had to be 18 years or older, only inpatients could be included, and length of stay had to be limited to 90 days.</p>
        <p>Prediction models were trained using a common model training strategy: we used the Transformer model [<xref ref-type="bibr" rid="ref18">18</xref>] to train a binary classification model for clinical risk prediction. We concatenated the features as inputs and used the labels as targets for the model training process. The models were trained with patient data that were collected at the time of discharge with leaking features removed. In order to cope with the situation where the model was requested to make predictions when less information was available, we applied data augmentation in training sample preparation: we generated partial records in combination with the complete records to enhance the robustness of the clinical risk prediction model. More details can be found in our previous work [<xref ref-type="bibr" rid="ref9">9</xref>]. The generated models were first examined with a model checking process, where a set of minimum requirements were assessed as unit tests. Models that passed the checks were further evaluated on their performance using common metrics, such as the area under the receiver operating characteristic curve (AUROC), sensitivity, and specificity, among others. Acceptance criteria were checked during the model evaluation process. The acceptance criteria differed among use cases and were checked for each department. Models that met the criteria could be activated in the corresponding departments to trigger alerts in the production EHR system. The acceptance criteria were complex and are explained in detail in Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <p>Risk prediction models evaluated in this paper were generated by our calibration tool in the three aforementioned German hospitals. The details of feature engineering and model training were presented in a former publication [<xref ref-type="bibr" rid="ref9">9</xref>]; examples of model input features are provided in Table S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Preliminary cross-hospital evaluation was performed with retrospective data in our previous study and performance degradation was observed [<xref ref-type="bibr" rid="ref9">9</xref>].</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Model development and evaluation with retrospective data.</p>
          </caption>
          <graphic xlink:href="jmir_v24i6e34295_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Calibration tool.</p>
          </caption>
          <graphic xlink:href="jmir_v24i6e34295_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Model Evaluation With Live Data From the Clinical Workflow</title>
        <p><xref rid="figure3" ref-type="fig">Figure 3</xref> shows the process of model evaluation with live data from the clinical workflow. Prediction services were triggered following clinical events in the EHR system (eg, when new lab results for a patient were added to the system). The EHR system sent the relevant patient record to the prediction service, where the hospitals’ specialized prediction models for three different use cases, trained on the hospital data, were deployed. For each use case, the prediction model predicted the risk of developing the related disease and returned the risk score in response. Based on the defined thresholds, alerts were created in the EHR system for those that were predicted as high risk. For each prediction made by the prediction service, the corresponding request and response were stored by a logging service. By comparing the predictions made by the prediction service and the corresponding real labels, we evaluated the model performance in a live clinical workflow. Moreover, the prediction requests stored by the logging service could be used to generate predictions with a different model to simulate its performance in a live clinical workflow. This alternate model could be a model that is trained in the same hospital with a different training strategy, as well as a model that is trained on the data from a different hospital. For example, in <xref rid="figure3" ref-type="fig">Figure 3</xref>, the logging information stored in hospital A (ie, the hospital where risk predictions in a live EHR system are made) can be used to generate predictions with a model trained at hospital B (ie, a different hospital where a different risk prediction model is trained). By comparing those predictions with the real labels, it is possible to estimate the performance of the model of hospital B in the live clinical workflow of hospital A.</p>
        <p>To support the evaluation presented in this paper, the JSON file logging driver (ie, the default Docker logging service) [<xref ref-type="bibr" rid="ref19">19</xref>] was used to log the request and response of prediction services to separate JSON log files. Each prediction request log entry contained the date and time and the input features for the prediction. Each prediction response log entry contained the used input features and risk score for the prediction. An excerpt of a sample log of prediction requests is enclosed (Table S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). In the production EHR system, the prediction service can process a patient’s records and instantly make a corresponding prediction or explanation. Prediction models for delirium, sepsis, and AKI were installed in three hospitals. Prediction requests and responses were logged in these three hospitals as input for the evaluations. The response time for predictions was evaluated and provided (Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Model evaluation with live data from the clinical workflow. Hospital A refers to the hospital where risk predictions in a live electronic health record system are made. Hospital B refers to a different hospital where a different risk prediction model is trained.</p>
          </caption>
          <graphic xlink:href="jmir_v24i6e34295_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>Our study to assess model performance involved the analysis of unidentifiable patients, and data use was granted to us by the pilot hospitals—hospital H, hospital M, and hospital N—for this purpose, after appropriate review. Therefore, no ethics approval by the Institutional Review Board was required. The cohort study at hospital H was approved by the Ethics Committee of the Medical Faculty of the Ruhr-Universität Bochum (file No. Az.2021-861).</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Evaluation of Model Performance in Live EHR Systems</title>
        <p>The models made predictions at different stages during a patient stay with live data; however, the performance of the clinical risk prediction models within a live clinical workflow was evaluated at the end of the day of admission, as well as on the day of discharge. The reason we checked the performance of our prediction model at these two stages was to evaluate their performance when there were limited data compared to when sufficient data were available. Leaking information, such as strong diagnostic data or textual references to the diseases to be predicted, was excluded, following the settings we applied when we evaluated the model performance on retrospective data in our previous study [<xref ref-type="bibr" rid="ref9">9</xref>]. Taking these same strategies allowed a fair comparison between the performance achieved on live data and that obtained on retrospective data. The model performance was evaluated by the AUROC. We chose to evaluate using the AUROC because the sensitivity, specificity, and precision were dependent on the threshold (ie, defined by the point chosen on the receiver operating characteristic curve). The threshold is used by the hospitals to trigger an alert and may differ among hospitals because some hospitals favor sensitivity over specificity or vice versa. Using the AUROC allowed us to compare the outcome of three use cases at three different hospitals independently from this threshold. Sensitivity, specificity, and precision were used to decide on the threshold and are provided with the explanation of the model acceptance criteria (Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p>
        <p><xref rid="figure4" ref-type="fig">Figure 4</xref> evaluates the model performance as assessed by the AUROC on the live data versus the retrospective data (Table S6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Each row in the table indicates the hospital in which the evaluation was performed. Each column indicates the use case and the point in time of the model evaluation (ie, either at the end of the admission day or at discharge). Positive values (ie, shades of green) indicate that the respective model performed better when evaluated on live data as compared to retrospective data, whereas negative values (ie, shades of red) indicate that models performed worse when evaluated on live data as compared to retrospective data. For example, the delirium model AUROC, evaluated at the end of the day of admission at hospital N, was 4.36 percentage points lower when the model was evaluated on the live data (AUROC=80.9%) as compared to retrospective data (AUROC=85.26%). On average, our delirium prediction model had a lower AUROC when evaluated on live data as compared to retrospective data. In contrast, our sepsis prediction model performed better on live data as compared to retrospective data, whereas the AKI prediction model performed equally well on both. At the hospital level, evaluation on live data led to higher model performance in hospital N (+0.1 percentage points) but to lower performance in hospitals M and H (–1.8 and –0.7 percentage points, respectively). When averaged across all three use cases and all three hospitals, the performance of our prediction models declined slightly when evaluated on live data (AUROC values: 83.1% at admission, 94.2% at discharge, and 88.6% on average) as compared to retrospective data (AUROC values: 83.0% at admission, 94.8% at discharge, and 89.4% on average).</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Model performance: live data vs retrospective data. The table was generated using the AUROC values for the live and retrospective data (Table S6 in Multimedia Appendix 1). adm: admission; AKI: acute kidney injury; AVG: average; AUROC: area under the receiver operating characteristic curve; dis: discharge; H: Herz- und Diabeteszentrum Nordrhein-Westfalen Bad Oeynhausen; M: Marienhospital Stuttgart; N: Medius Klinik Nürtingen.</p>
          </caption>
          <graphic xlink:href="jmir_v24i6e34295_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Cross-Hospital Evaluation</title>
        <p>Cross-hospital evaluation was performed by extracting the observations from the prediction request in one hospital and generating predictions using a model trained on data from a different hospital. We evaluated our models in a live clinical workflow based on the logging information stored in the prediction service. The prediction requests made at different stages of a medical stay were used to generate corresponding predictions by prediction models of other hospitals. By using the prediction models of other hospitals, we simulated the performance of these models in a live clinical workflow, without the model being installed on-site.</p>
        <p><xref rid="figure5" ref-type="fig">Figure 5</xref> shows an example of simulating the performance of models trained on data from hospitals M and N, but applied to live data of the medical stay of a sample patient in hospital H. The red vertical line indicates the point in time of the patient’s surgery. The three colored lines reflect the simulated model prediction over the course of the patient’s medical stay in hospital H, using models trained separately on data from hospitals H, M, and N.</p>
        <p>In the presented case, postoperative delirium was confirmed by an independent evaluation—the Confusion Assessment Method for the ICU (CAM-ICU) [<xref ref-type="bibr" rid="ref20">20</xref>]—on the first postoperative day. The CAM-ICU evaluation was not included as a feature of our training model. Of the three models, the one trained at hospital H predicted the risk of delirium before surgery and identified an increased risk after surgery. The risk after surgery increased gradually when lab results and clinical entities were added. The models trained at the other hospitals both predicted the risk of delirium before surgery, but both failed to properly identify the severity of the risk after surgery.</p>
        <p>The detailed outcome of cross-hospital evaluation on prediction requests extracted from the live clinical workflow is provided in Table S7 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Prediction models for three different diseases (ie, delirium, sepsis, and AKI) were evaluated by comparing the AUROCs of different models at discharge. <xref rid="figure6" ref-type="fig">Figure 6</xref> depicts the performance degradation of a model when trained in a certain hospital and deployed in another hospital. For each use case, AUROC values in a row are compared to the white cell in the same row, which indicates within-hospital performance. For example, when the delirium model trained on data from hospital H was deployed in hospital M (91.2%, column 2, row 1; Table S7 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), the AUROC was 3.2 percentage points lower as compared to its performance in hospital H (94.4%, column 1, row 1; Table S7 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The largest performance degradation (–20.5 percentage points) was observed when the AKI model trained on data from hospital H was deployed in hospital M.</p>
        <p>On average, the AUROC was 8 percentage points lower when a model was deployed in a hospital other than where it was trained (from 94.2% to 86.3%).</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Delirium risk prediction of a sample patient during his medical stay based on data from the live electronic health record system. H: Herz- und Diabeteszentrum Nordrhein-Westfalen Bad Oeynhausen; M: Marienhospital Stuttgart; N: Medius Klinik Nürtingen.</p>
          </caption>
          <graphic xlink:href="jmir_v24i6e34295_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure6" position="float">
          <label>Figure 6</label>
          <caption>
            <p>Performance degradation of a model trained in a certain hospital (rows) and deployed in another hospital (columns). The table was generated from the AUROC values from cross-hospital evaluation on the live data (Table S7 in Multimedia Appendix 1). AKI: acute kidney injury; AUROC: area under the receiver operating characteristic curve; H: Herz- und Diabeteszentrum Nordrhein-Westfalen Bad Oeynhausen; M: Marienhospital Stuttgart; N: Medius Klinik Nürtingen.</p>
          </caption>
          <graphic xlink:href="jmir_v24i6e34295_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Preliminary Evaluation of Clinical Soundness and Usefulness</title>
        <p>The prediction models were installed in the live EHR systems of three different hospitals. These models generated predictions and triggered alerts in a live clinical workflow. Those alerts were displayed in the production EHR system and are currently under the evaluation of domain experts. A quantitative evaluation of the impact on clinical outcomes has not yet been performed. Nevertheless, the preliminary evaluation made by the domain experts assures the correctness and effectiveness of the predictions. A case study has been conducted to evaluate the performance of the delirium prediction models installed in hospital H [<xref ref-type="bibr" rid="ref21">21</xref>]. Predictions made by the delirium risk prediction model following cardiac surgery were evaluated in the study. A cohort study investigating a larger population is also ongoing in the same hospital. The investigations identified that the prediction service could have an influence on anesthesia planning, as risk prediction is crucial for an early prevention strategy. The machine learning approach also improved postoperative care by enhanced screening efforts. In addition, the rest of this section presents our analysis of calibration and decision curves at hospital H, as well as our preliminary analysis on user feedback at hospital M.</p>
      </sec>
      <sec>
        <title>Calibration and Decision Curve Analysis</title>
        <p><xref rid="figure7" ref-type="fig">Figure 7</xref> shows the calibration and decision curve analysis for three use cases with the live data retrieved from hospital H. We first applied probability calibration [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>] to generate calibration curves for each use case. The calibration curves plot the true frequency of the positive cases against its averaged predicted probability for each bin. We divided the probability into 10 bins. Predictions on the live data before and after probability calibration are shown. We used isotonic regression to perform the probability calibration. The calibration process used the first half of the live data, and the calibration curves and decision curves were generated using the second half of the live data. Due to the limited amount of available data, there are a few spikes in the calibrated curves. After the probability calibration, the decision curves [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>] were generated to evaluate the net benefit of using the prediction models. The net benefits of the prediction models were compared with either “alert all patients” or “no alerts.” It can be observed that the prediction models were clinically useful when the threshold probability was below 90% for the AKI and sepsis use cases. For the delirium use case, the model had benefits when the threshold probability was below 70%.</p>
        <p><xref rid="figure8" ref-type="fig">Figure 8</xref> shows the decision curves for the prediction models trained at hospitals H and M on the sepsis use case. Both curves in <xref rid="figure8" ref-type="fig">Figure 8</xref> were generated using the live data from hospital H, and the predicted probabilities were both calibrated. It can be observed that the model trained at hospital H was superior compared with the model trained at hospital M.</p>
        <fig id="figure7" position="float">
          <label>Figure 7</label>
          <caption>
            <p>Calibration and decision curve analysis. The model and data were both from hospital H (Herz- und Diabeteszentrum Nordrhein-Westfalen Bad Oeynhausen). AKI: acute kidney injury.</p>
          </caption>
          <graphic xlink:href="jmir_v24i6e34295_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure8" position="float">
          <label>Figure 8</label>
          <caption>
            <p>Decision curve analysis for the sepsis use case. Models trained at hospitals H and M, both using the live data from hospital H, are compared. H: Herz- und Diabeteszentrum Nordrhein-Westfalen Bad Oeynhausen; M: Marienhospital Stuttgart.</p>
          </caption>
          <graphic xlink:href="jmir_v24i6e34295_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Preliminary Analysis of User Feedback</title>
        <p>When the prediction models were installed in the production EHR system, the end user was able to provide their feedback when they closed an alert. There were 134 feedback entries collected for the AKI use case at hospital M. More than one-third of the feedback entries (n=46, 34.3%) indicated that the users found the predictions useful. Details of the user feedback entries can be found in Figure S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <p>A total of 27.6% (37/134) of the alerts were considered to be false positives by the end users. This is a satisfactory result, considering the low incidence of AKI (838/8861, 9.46% at hospital M). Moreover, among the 37 cases evaluated as false positives, 20 (54%) were already discharged and coded. Of these 20 discharged cases, 4 (20%) were actually coded as having AKI. This means that even if the physician disagrees with a prediction of high risk, there seems to still be a high risk that some patients will ultimately develop AKI, and our model can identify that risk.</p>
        <p>In 38.1% (51/134) of the cases, the end users were already aware of the risk of AKI raised by the alert. There were two main reasons for this. Firstly, there was a clear gap between the time that the alert was created and the time that the feedback was given when the alert was closed. Secondly, alerts were only displayed in departments where the prediction service was activated; if a patient was transferred from a department where the prediction service was not activated, there would not be any alert displayed there.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>The state of the art of machine learning development is to either design and train a single model and use it in different hospitals or design and train a specific model for a single hospital. We claim that defining a generic model design and training a specific instance of the model with data from a specific hospital has additional benefits for replicating the results. We observed performance degradation when a model was deployed in another hospital in our cross-hospital evaluation, a typical limitation of developing a single model for different hospitals. In contrast, having a generic process and common model design to generate hospital-specific prediction models is a more robust solution. It resolves the intrinsic differences between different hospitals and guarantees sound performance at target hospitals. The evaluation of model performance in live clinical workflows assured the feasibility of such a generic approach, by checking the performance on three use cases at three different hospitals. In addition, by storing the logging data from live clinical workflows and having a common model design, the evaluation presented in this paper allows one to simulate the performance of a model in a live clinical workflow without it being installed on-site.</p>
      </sec>
      <sec>
        <title>Motivations</title>
        <p>Machine learning–based prediction models are closely tied to the data used in the training process. This dependency largely restricts the reusability of a prediction model in other hospitals. A generic model that delivers unbiased performance in different hospitals is what machine learning scientists and clinicians earnestly long for but also often fail to achieve.</p>
        <p>The prerequisite to generate a generic model that can be used in different hospitals is to achieve semantic interoperability that guarantees a common understanding between different EHR systems [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. In order to achieve semantic interoperability, clinical terminologies need to be mapped onto a standard representation. However, a recent study [<xref ref-type="bibr" rid="ref28">28</xref>] also showed safety risks related to the use of standard terminologies, such as LOINC (Logical Observation Identifiers Names and Codes), for interoperability between organizations due to inaccurate mappings.</p>
        <p>In addition, a disease may have very different incidence rates in different hospitals due to the type and specialties of a hospital. Such a variety also results in different clinical workflows performed in different hospitals that determine the data the hospital records. A prediction model is, therefore, considered an algorithm that captures the knowledge and practice of the physicians of a hospital, by processing hospital-specific data that are presented in their specific representation. It is challenging to overcome the vulnerability of data shifts caused by diverse clinical workflows in different hospitals. Therefore, it is hard to maintain good performance when a model runs in a different hospital than the one within which it was trained, especially if the characteristics of the EHR data and the clinical workflow differ significantly. For example, the sepsis prediction of one particular vendor achieved satisfactory results in one hospital [<xref ref-type="bibr" rid="ref29">29</xref>], but it was substantially worse when evaluated in another hospital [<xref ref-type="bibr" rid="ref15">15</xref>].</p>
        <p>We also observed performance degradation when a model was deployed in other hospitals in our cross-hospital evaluation. Therefore, instead of delivering a generic prediction model to different hospitals, we designed a generic procedure for prediction model development and applied it to different hospitals. Having a generic process to generate hospital-specific prediction models is a more robust solution; it resolves the intrinsic differences between different hospitals.</p>
      </sec>
      <sec>
        <title>Strengths</title>
        <p>Evaluating prediction models in a live clinical workflow is crucial for validating their performances. To the best of our knowledge, we are the first to evaluate clinical prediction models on such a large scale in live clinical workflows. Such a thorough evaluation avoids overfitting to a certain disease or the settings of a particular hospital, thus allowing a fair, unbiased evaluation. The models deployed in the live clinical workflows delivered similar performances compared with those reported in our previous study [<xref ref-type="bibr" rid="ref9">9</xref>], which were evaluated using retrospective data.</p>
        <p>Sharing the same feature processing approach allows us to reuse the prediction requests by different prediction models. We, therefore, performed cross-hospital evaluation on three use cases in three different hospitals, mimicking the performance in a live clinical workflow rather than on retrospective data. To the best of our knowledge, this is the first study that performed cross-hospital evaluations on multiple use cases and simulated model performance in live clinical workflows.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>This study had some limitations. First, our model development and evaluation on metrics reported in this paper lacked a dynamic evaluation to predict the risk within a time window of event onset. For example, the most widely used diagnostic criterion for AKI is based on changes in serum creatinine, as defined by the Kidney Disease: Improving Global Outcomes (KDIGO) guidelines [<xref ref-type="bibr" rid="ref30">30</xref>]. Tomašev et al [<xref ref-type="bibr" rid="ref31">31</xref>] reported an AKI prediction model that predicts the AKI risk 48 hours before the KDIGO-defined event. In the three use cases presented in this paper, delirium was considered a mental health disease that normally does not have a precise time of onset. We have developed an AKI prediction model based on retrospective data at our development site using KDIGO events as labels. The models were not deployed in the production system; however, their performance at two hospitals on retrospective data is enclosed (Table S8 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The AKI risk prediction curve of a sample patient during his medical stay is also provided (Figure S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). For the sepsis prediction, we did not yet perform such a dynamic evaluation due to the lack of both scalable and accurate indicators of documented or suspected infection. Nevertheless, the AUROC for sepsis at the end of the day of admission ranged between 86.9% and 88.5% in the live system at the three different hospitals, which assures a satisfactory performance.</p>
        <p>Second, although the metrics of the prediction models in the live clinical workflows were evaluated in different hospitals, the corresponding clinical outcomes in clinical practice are yet to be measured. Nevertheless, the preliminary clinical evaluation in hospital H affirms that there was a positive impact in the live clinical workflows, and a quantitative evaluation is scheduled as our next step of this work. The decision curve analysis and the preliminary analysis of user feedback also affirm the usefulness of our prediction models.</p>
        <p>Third, machine learning approaches that are used to generate and validate prediction models are always data hungry [<xref ref-type="bibr" rid="ref32">32</xref>]. Current external validation studies often suffer from small sample sizes compared with the large amount of predictor features [<xref ref-type="bibr" rid="ref33">33</xref>]. The sample size presented in this paper was also relatively small compared with the number of predictor features used in our prediction model. Nevertheless, we also argue that for diseases with low incidence, it is difficult to obtain a large number of positive samples. The three use cases presented in this paper were running in live EHR systems for more than half a year, which we consider to be a reasonable amount of time. In addition, we ran evaluations on three different use cases at three different hospitals, which helps to justify the outcomes.</p>
      </sec>
      <sec>
        <title>Future Directions</title>
        <p>Our future work will focus on evaluating the detailed clinical outcomes of prediction models in clinical practice. In addition, we will also evaluate the impact of different labeling strategies, such as defining AKI events with KDIGO criteria, in live systems.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In this study, we found consistent performance of models when evaluated on retrospective and live data, and performance differences were observed in the cross-hospital evaluations. This ensures that designing a generic process for model development, implementing that design in a calibration tool, and generating hospital-specific prediction models with a common model design is a valid approach that guarantees model performance in different hospitals.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Supplementary materials.</p>
        <media xlink:href="jmir_v24i6e34295_app1.docx" xlink:title="DOCX File , 208 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">AKI</term>
          <def>
            <p>acute kidney injury</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">AUROC</term>
          <def>
            <p>area under the receiver operating characteristic curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">BERT</term>
          <def>
            <p>bidirectional encoder representations from transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">CAM-ICU</term>
          <def>
            <p>Confusion Assessment Method for the Intensive Care Unit</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">FHIR</term>
          <def>
            <p>Fast Healthcare Interoperability Resources</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">GenoMed4All</term>
          <def>
            <p>Genomics and Personalized Medicine for All Though Artificial Intelligence in Haematological Diseases</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">hospital H</term>
          <def>
            <p>Herz- und Diabeteszentrum Nordrhein-Westfalen Bad Oeynhausen</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">hospital M</term>
          <def>
            <p>Marienhospital Stuttgart</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">hospital N</term>
          <def>
            <p>Medius Klinik Nürtingen</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">ICU</term>
          <def>
            <p>intensive care unit</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">KDIGO</term>
          <def>
            <p>Kidney Disease: Improving Global Outcomes</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">LOINC</term>
          <def>
            <p>Logical Observation Identifiers Names and Codes</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">PERSIST</term>
          <def>
            <p>Patients-Centered Survivorship Care Plan After Cancer Treatments Based on Big Data and Artificial Intelligence Technologies</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>JF, NH, and VvD received funding from the ARGUS project (grant 100126059) from the Ruhr-Universität Bochum. Authors from Dedalus received funding from the European Union’s Horizon 2020 projects GenoMed4All (Genomics and Personalized Medicine for All Though Artificial Intelligence in Haematological Diseases; grant 101017549) and PERSIST (Patients-Centered Survivorship Care Plan After Cancer Treatments Based on Big Data and Artificial Intelligence Technologies; grant 875406). The authors would like to acknowledge Dieter Vanden Abeele and Nico Lapauw for the integration of the prediction services into the ORBIS EHR system and for preparing the logs, Corry Clybouw for proofreading and improving the presentation of the paper, and Marienhospital Stuttgart, Herz- und Diabeteszentrum Nordrhein-Westfalen Bad Oeynhausen, Alexianer Krefeld, and Medius Klinik Nürtingen for assisting with the model calibration and prediction model evaluation in their production systems.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The patient data used in this evaluation from the three hospitals cannot be made publicly available due to patient protection. The code used to evaluate the model performance and to run the cross-hospital evaluation is available at GitHub [<xref ref-type="bibr" rid="ref34">34</xref>].</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>HS, KD, and LM conceptualized the study and designed the evaluation methods. MDH provided input to consolidate the study. HS and PCS performed the model evaluation. LM, JF, NH, and VvD provided the clinical perspective. RS coordinated the resources to perform the evaluation. KD and MDH supervised the process of model evaluation. MV, HS, KD, and LM defined the model design, and MV, JDB, HS, and KD developed the calibration tool accordingly. KD and JDB developed the solution architecture for the prediction service and its integration into the production system. HS wrote the original draft, and MDH and LM provided input as medical editors. All authors reviewed and edited the manuscript critically and approved its final version.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Esteva</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Robicquet</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ramsundar</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Kuleshov</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>DePristo</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chou</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Thrun</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A guide to deep learning in healthcare</article-title>
          <source>Nat Med</source>
          <year>2019</year>
          <month>01</month>
          <volume>25</volume>
          <issue>1</issue>
          <fpage>24</fpage>
          <lpage>29</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-018-0316-z</pub-id>
          <pub-id pub-id-type="medline">30617335</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-018-0316-z</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cutillo</surname>
              <given-names>CM</given-names>
            </name>
            <name name-style="western">
              <surname>Sharma</surname>
              <given-names>KR</given-names>
            </name>
            <name name-style="western">
              <surname>Foschini</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Kundu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mackintosh</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mandl</surname>
              <given-names>KD</given-names>
            </name>
            <collab>MI in Healthcare Workshop Working Group</collab>
          </person-group>
          <article-title>Machine intelligence in healthcare-perspectives on trustworthiness, explainability, usability, and transparency</article-title>
          <source>NPJ Digit Med</source>
          <year>2020</year>
          <volume>3</volume>
          <fpage>47</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-020-0254-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-020-0254-2</pub-id>
          <pub-id pub-id-type="medline">32258429</pub-id>
          <pub-id pub-id-type="pii">254</pub-id>
          <pub-id pub-id-type="pmcid">PMC7099019</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rajkomar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kohane</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Machine learning in medicine</article-title>
          <source>N Engl J Med</source>
          <year>2019</year>
          <month>04</month>
          <day>04</day>
          <volume>380</volume>
          <issue>14</issue>
          <fpage>1347</fpage>
          <lpage>1358</lpage>
          <pub-id pub-id-type="doi">10.1056/NEJMra1814259</pub-id>
          <pub-id pub-id-type="medline">30943338</pub-id>
          <pub-id pub-id-type="pii">10.1056/NEJMra1814259</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goldstein</surname>
              <given-names>BA</given-names>
            </name>
            <name name-style="western">
              <surname>Navar</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Pencina</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ioannidis</surname>
              <given-names>JPA</given-names>
            </name>
          </person-group>
          <article-title>Opportunities and challenges in developing risk prediction models with electronic health records data: A systematic review</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2017</year>
          <month>01</month>
          <volume>24</volume>
          <issue>1</issue>
          <fpage>198</fpage>
          <lpage>208</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/27189013"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocw042</pub-id>
          <pub-id pub-id-type="medline">27189013</pub-id>
          <pub-id pub-id-type="pii">ocw042</pub-id>
          <pub-id pub-id-type="pmcid">PMC5201180</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Topol</surname>
              <given-names>EJ</given-names>
            </name>
          </person-group>
          <article-title>High-performance medicine: The convergence of human and artificial intelligence</article-title>
          <source>Nat Med</source>
          <year>2019</year>
          <month>01</month>
          <volume>25</volume>
          <issue>1</issue>
          <fpage>44</fpage>
          <lpage>56</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-018-0300-7</pub-id>
          <pub-id pub-id-type="medline">30617339</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-018-0300-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rajkomar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Oren</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Dai</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Hajaj</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Hardt</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>PJ</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Marcus</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sundberg</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Yee</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Flores</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Duggan</surname>
              <given-names>GE</given-names>
            </name>
            <name name-style="western">
              <surname>Irvine</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Le</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Litsch</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Mossin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Tansuwan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Wexler</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wilson</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ludwig</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Volchenboum</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Chou</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Pearson</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Madabushi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>NH</given-names>
            </name>
            <name name-style="western">
              <surname>Butte</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Howell</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Scalable and accurate deep learning with electronic health records</article-title>
          <source>NPJ Digit Med</source>
          <year>2018</year>
          <volume>1</volume>
          <fpage>18</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-018-0029-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-018-0029-1</pub-id>
          <pub-id pub-id-type="medline">31304302</pub-id>
          <pub-id pub-id-type="pii">29</pub-id>
          <pub-id pub-id-type="pmcid">PMC6550175</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mandel</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Kreda</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Mandl</surname>
              <given-names>KD</given-names>
            </name>
            <name name-style="western">
              <surname>Kohane</surname>
              <given-names>IS</given-names>
            </name>
            <name name-style="western">
              <surname>Ramoni</surname>
              <given-names>RB</given-names>
            </name>
          </person-group>
          <article-title>SMART on FHIR: A standards-based, interoperable apps platform for electronic health records</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2016</year>
          <month>09</month>
          <volume>23</volume>
          <issue>5</issue>
          <fpage>899</fpage>
          <lpage>908</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/26911829"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocv189</pub-id>
          <pub-id pub-id-type="medline">26911829</pub-id>
          <pub-id pub-id-type="pii">ocv189</pub-id>
          <pub-id pub-id-type="pmcid">PMC4997036</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Churpek</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Yuen</surname>
              <given-names>TC</given-names>
            </name>
            <name name-style="western">
              <surname>Winslow</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Robicsek</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>Meltzer</surname>
              <given-names>DO</given-names>
            </name>
            <name name-style="western">
              <surname>Gibbons</surname>
              <given-names>RD</given-names>
            </name>
            <name name-style="western">
              <surname>Edelson</surname>
              <given-names>DP</given-names>
            </name>
          </person-group>
          <article-title>Multicenter development and validation of a risk stratification tool for ward patients</article-title>
          <source>Am J Respir Crit Care Med</source>
          <year>2014</year>
          <month>09</month>
          <day>15</day>
          <volume>190</volume>
          <issue>6</issue>
          <fpage>649</fpage>
          <lpage>655</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/25089847"/>
          </comment>
          <pub-id pub-id-type="doi">10.1164/rccm.201406-1022OC</pub-id>
          <pub-id pub-id-type="medline">25089847</pub-id>
          <pub-id pub-id-type="pmcid">PMC4214112</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Depraetere</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Meesseman</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>De Roo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Vanbiervliet</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>De Baerdemaeker</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Muys</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>von Dossow</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Hulde</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Szymanowsky</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>A scalable approach for developing clinical risk prediction applications in different hospitals</article-title>
          <source>J Biomed Inform</source>
          <year>2021</year>
          <month>06</month>
          <volume>118</volume>
          <fpage>103783</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(21)00112-X"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2021.103783</pub-id>
          <pub-id pub-id-type="medline">33887456</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(21)00112-X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>He</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Baxter</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>The practical implementation of artificial intelligence technologies in medicine</article-title>
          <source>Nat Med</source>
          <year>2019</year>
          <month>01</month>
          <volume>25</volume>
          <issue>1</issue>
          <fpage>30</fpage>
          <lpage>36</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/30617336"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41591-018-0307-0</pub-id>
          <pub-id pub-id-type="medline">30617336</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-018-0307-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC6995276</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Domalpally</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Channa</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Real-world validation of artificial intelligence algorithms for ophthalmic imaging</article-title>
          <source>Lancet Digit Health</source>
          <year>2021</year>
          <month>08</month>
          <volume>3</volume>
          <issue>8</issue>
          <fpage>e463</fpage>
          <lpage>e464</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2589-7500(21)00140-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/S2589-7500(21)00140-0</pub-id>
          <pub-id pub-id-type="medline">34325850</pub-id>
          <pub-id pub-id-type="pii">S2589-7500(21)00140-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>MY</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>UJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>HT</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>WH</given-names>
            </name>
          </person-group>
          <article-title>DELirium Prediction based on Hospital Information (Delphi) in general surgery patients</article-title>
          <source>Medicine (Baltimore)</source>
          <year>2016</year>
          <month>03</month>
          <volume>95</volume>
          <issue>12</issue>
          <fpage>e3072</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1097/MD.0000000000003072"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/MD.0000000000003072</pub-id>
          <pub-id pub-id-type="medline">27015177</pub-id>
          <pub-id pub-id-type="pii">00005792-201603220-00014</pub-id>
          <pub-id pub-id-type="pmcid">PMC4998372</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Young</surname>
              <given-names>AT</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Gonzales</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Douglas</surname>
              <given-names>VC</given-names>
            </name>
            <name name-style="western">
              <surname>Hadley</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Development and validation of an electronic health record-based machine learning model to estimate delirium risk in newly hospitalized patients without known cognitive impairment</article-title>
          <source>JAMA Netw Open</source>
          <year>2018</year>
          <month>08</month>
          <day>03</day>
          <volume>1</volume>
          <issue>4</issue>
          <fpage>e181018</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://jamanetwork.com/journals/jamanetworkopen/fullarticle/10.1001/jamanetworkopen.2018.1018"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2018.1018</pub-id>
          <pub-id pub-id-type="medline">30646095</pub-id>
          <pub-id pub-id-type="pii">2695078</pub-id>
          <pub-id pub-id-type="pmcid">PMC6324291</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jauk</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kramer</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Großauer</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Rienmüller</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Avian</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Berghold</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Leodolter</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Schulz</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Risk prediction of delirium in hospitalized patients using machine learning: An implementation and prospective evaluation study</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2020</year>
          <month>07</month>
          <day>01</day>
          <volume>27</volume>
          <issue>9</issue>
          <fpage>1383</fpage>
          <lpage>1392</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/32968811"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocaa113</pub-id>
          <pub-id pub-id-type="medline">32968811</pub-id>
          <pub-id pub-id-type="pii">5910737</pub-id>
          <pub-id pub-id-type="pmcid">PMC7647341</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Otles</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Donnelly</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Krumm</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>McCullough</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>DeTroyer-Cooley</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Pestrue</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Phillips</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Konye</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Penoza</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Ghous</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>External validation of a widely implemented proprietary sepsis prediction model in hospitalized patients</article-title>
          <source>JAMA Intern Med</source>
          <year>2021</year>
          <month>08</month>
          <day>01</day>
          <volume>181</volume>
          <issue>8</issue>
          <fpage>1065</fpage>
          <lpage>1070</lpage>
          <pub-id pub-id-type="doi">10.1001/jamainternmed.2021.2626</pub-id>
          <pub-id pub-id-type="medline">34152373</pub-id>
          <pub-id pub-id-type="pii">2781307</pub-id>
          <pub-id pub-id-type="pmcid">PMC8218233</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Daneshjou</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ouyang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ho</surname>
              <given-names>DE</given-names>
            </name>
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>How medical AI devices are evaluated: Limitations and recommendations from an analysis of FDA approvals</article-title>
          <source>Nat Med</source>
          <year>2021</year>
          <month>04</month>
          <volume>27</volume>
          <issue>4</issue>
          <fpage>582</fpage>
          <lpage>584</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-021-01312-x</pub-id>
          <pub-id pub-id-type="medline">33820998</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-021-01312-x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: Pre-training of deep bidirectional transformers for language understanding</article-title>
          <source>ArXiv. Preprint posted online on May 24, 2019</source>
          <year>2022</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1810.04805"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vaswani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shazeer</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Parmar</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Uszkoreit</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gomez</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kaiser</surname>
              <given-names>Ł</given-names>
            </name>
            <name name-style="western">
              <surname>Polosukhin</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Attention is all you need</article-title>
          <source>Proceedings of the 31st International Conference on Neural Information Processing Systems</source>
          <year>2017</year>
          <conf-name>The 31st International Conference on Neural Information Processing Systems</conf-name>
          <conf-date>December 4-9, 2017</conf-date>
          <conf-loc>Long Beach, CA</conf-loc>
          <fpage>6000</fpage>
          <lpage>6010</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/pdf/10.5555/3295222.3295349"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
          <article-title>JSON File logging driver</article-title>
          <source>Docker Docs</source>
          <access-date>2022-04-05</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://docs.docker.com/config/containers/logging/json-file/">https://docs.docker.com/config/containers/logging/json-file/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Khan</surname>
              <given-names>BA</given-names>
            </name>
            <name name-style="western">
              <surname>Perkins</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hui</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Campbell</surname>
              <given-names>NL</given-names>
            </name>
            <name name-style="western">
              <surname>Farber</surname>
              <given-names>MO</given-names>
            </name>
            <name name-style="western">
              <surname>Chlan</surname>
              <given-names>LL</given-names>
            </name>
            <name name-style="western">
              <surname>Boustani</surname>
              <given-names>MA</given-names>
            </name>
          </person-group>
          <article-title>The Confusion Assessment Method for the ICU-7 delirium severity scale</article-title>
          <source>Crit Care Med</source>
          <year>2017</year>
          <volume>45</volume>
          <issue>5</issue>
          <fpage>851</fpage>
          <lpage>857</lpage>
          <pub-id pub-id-type="doi">10.1097/ccm.0000000000002368</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fliegenschmidt</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hulde</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Preising</surname>
              <given-names>MG</given-names>
            </name>
            <name name-style="western">
              <surname>Ruggeri</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Szymanowski</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Meesseman</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>von Dossow</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence predicts delirium following cardiac surgery: A case study</article-title>
          <source>J Clin Anesth</source>
          <year>2021</year>
          <month>12</month>
          <volume>75</volume>
          <fpage>110473</fpage>
          <pub-id pub-id-type="doi">10.1016/j.jclinane.2021.110473</pub-id>
          <pub-id pub-id-type="medline">34333447</pub-id>
          <pub-id pub-id-type="pii">S0952-8180(21)00315-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Riley</surname>
              <given-names>RD</given-names>
            </name>
            <name name-style="western">
              <surname>Ensor</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Snell</surname>
              <given-names>KIE</given-names>
            </name>
            <name name-style="western">
              <surname>Debray</surname>
              <given-names>TPA</given-names>
            </name>
            <name name-style="western">
              <surname>Altman</surname>
              <given-names>DG</given-names>
            </name>
            <name name-style="western">
              <surname>Moons</surname>
              <given-names>KGM</given-names>
            </name>
            <name name-style="western">
              <surname>Collins</surname>
              <given-names>GS</given-names>
            </name>
          </person-group>
          <article-title>External validation of clinical prediction models using big datasets from e-health records or IPD meta-analysis: Opportunities and challenges</article-title>
          <source>BMJ</source>
          <year>2016</year>
          <month>06</month>
          <day>22</day>
          <volume>353</volume>
          <fpage>i3140</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.bmj.com/lookup/pmidlookup?view=long&amp;pmid=27334381"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmj.i3140</pub-id>
          <pub-id pub-id-type="medline">27334381</pub-id>
          <pub-id pub-id-type="pmcid">PMC4916924</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Niculescu-Mizil</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Caruana</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Predicting good probabilities with supervised learning</article-title>
          <source>Proceedings of the 22nd International Conference on Machine Learning</source>
          <year>2005</year>
          <conf-name>The 22nd International Conference on Machine Learning</conf-name>
          <conf-date>August 7-11, 2005</conf-date>
          <conf-loc>Bonn, Germany</conf-loc>
          <fpage>625</fpage>
          <lpage>632</lpage>
          <pub-id pub-id-type="doi">10.1145/1102351.1102430</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vickers</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Elkin</surname>
              <given-names>EB</given-names>
            </name>
          </person-group>
          <article-title>Decision curve analysis: A novel method for evaluating prediction models</article-title>
          <source>Med Decis Making</source>
          <year>2006</year>
          <volume>26</volume>
          <issue>6</issue>
          <fpage>565</fpage>
          <lpage>574</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/17099194"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/0272989X06295361</pub-id>
          <pub-id pub-id-type="medline">17099194</pub-id>
          <pub-id pub-id-type="pii">26/6/565</pub-id>
          <pub-id pub-id-type="pmcid">PMC2577036</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vickers</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>van Calster</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Steyerberg</surname>
              <given-names>EW</given-names>
            </name>
          </person-group>
          <article-title>A simple, step-by-step guide to interpreting decision curve analysis</article-title>
          <source>Diagn Progn Res</source>
          <year>2019</year>
          <volume>3</volume>
          <fpage>18</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/31592444"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s41512-019-0064-7</pub-id>
          <pub-id pub-id-type="medline">31592444</pub-id>
          <pub-id pub-id-type="pii">64</pub-id>
          <pub-id pub-id-type="pmcid">PMC6777022</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Depraetere</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>De Roo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mels</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>De Vloed</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Twagirumukiza</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Colaert</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Semantic processing of EHR data for clinical research</article-title>
          <source>J Biomed Inform</source>
          <year>2015</year>
          <month>12</month>
          <volume>58</volume>
          <fpage>247</fpage>
          <lpage>259</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(15)00231-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2015.10.009</pub-id>
          <pub-id pub-id-type="medline">26515501</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(15)00231-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bhartiya</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mehrotra</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Girdhar</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Issues in achieving complete interoperability while sharing electronic health records</article-title>
          <source>Procedia Comput Sci</source>
          <year>2016</year>
          <volume>78</volume>
          <fpage>192</fpage>
          <lpage>198</lpage>
          <pub-id pub-id-type="doi">10.1016/j.procs.2016.02.033</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Carter</surname>
              <given-names>AB</given-names>
            </name>
            <name name-style="western">
              <surname>de Baca</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Luu</surname>
              <given-names>HS</given-names>
            </name>
            <name name-style="western">
              <surname>Campbell</surname>
              <given-names>WS</given-names>
            </name>
            <name name-style="western">
              <surname>Stram</surname>
              <given-names>MN</given-names>
            </name>
          </person-group>
          <article-title>Use of LOINC for interoperability between organisations poses a risk to safety</article-title>
          <source>Lancet Digit Health</source>
          <year>2020</year>
          <month>11</month>
          <volume>2</volume>
          <issue>11</issue>
          <fpage>e569</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2589-7500(20)30244-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/S2589-7500(20)30244-2</pub-id>
          <pub-id pub-id-type="medline">33328084</pub-id>
          <pub-id pub-id-type="pii">S2589-7500(20)30244-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bennett</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Russell</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>King</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Schilling</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Voong</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Rogers</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Adrian</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Bruce</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Ghosh</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Accuracy of the Epic sepsis prediction model in a regional health system</article-title>
          <source>ArXiv. Preprint posted online on February 19, 2019</source>
          <year>2019</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1902.07276"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Khwaja</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>KDIGO clinical practice guidelines for acute kidney injury</article-title>
          <source>Nephron Clin Pract</source>
          <year>2012</year>
          <volume>120</volume>
          <issue>4</issue>
          <fpage>c179</fpage>
          <lpage>c184</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.karger.com?DOI=10.1159/000339789"/>
          </comment>
          <pub-id pub-id-type="doi">10.1159/000339789</pub-id>
          <pub-id pub-id-type="medline">22890468</pub-id>
          <pub-id pub-id-type="pii">000339789</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tomašev</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Glorot</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Rae</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Zielinski</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Askham</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Saraiva</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mottram</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Meyer</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Ravuri</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Protsyuk</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Connell</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hughes</surname>
              <given-names>CO</given-names>
            </name>
            <name name-style="western">
              <surname>Karthikesalingam</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cornebise</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Montgomery</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Rees</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Laing</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Baker</surname>
              <given-names>CR</given-names>
            </name>
            <name name-style="western">
              <surname>Peterson</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Reeves</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Hassabis</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>King</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Suleyman</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Back</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Nielson</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Ledsam</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Mohamed</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>A clinically applicable approach to continuous prediction of future acute kidney injury</article-title>
          <source>Nature</source>
          <year>2019</year>
          <month>08</month>
          <volume>572</volume>
          <issue>7767</issue>
          <fpage>116</fpage>
          <lpage>119</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/31367026"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41586-019-1390-1</pub-id>
          <pub-id pub-id-type="medline">31367026</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41586-019-1390-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC6722431</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>van der Ploeg</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Austin</surname>
              <given-names>PC</given-names>
            </name>
            <name name-style="western">
              <surname>Steyerberg</surname>
              <given-names>EW</given-names>
            </name>
          </person-group>
          <article-title>Modern modelling techniques are data hungry: A simulation study for predicting dichotomous endpoints</article-title>
          <source>BMC Med Res Methodol</source>
          <year>2014</year>
          <month>12</month>
          <day>22</day>
          <volume>14</volume>
          <fpage>137</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/1471-2288-14-137"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1471-2288-14-137</pub-id>
          <pub-id pub-id-type="medline">25532820</pub-id>
          <pub-id pub-id-type="pii">1471-2288-14-137</pub-id>
          <pub-id pub-id-type="pmcid">PMC4289553</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Riley</surname>
              <given-names>RD</given-names>
            </name>
            <name name-style="western">
              <surname>Debray</surname>
              <given-names>TPA</given-names>
            </name>
            <name name-style="western">
              <surname>Collins</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Archer</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Ensor</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>van Smeden</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Snell</surname>
              <given-names>KIE</given-names>
            </name>
          </person-group>
          <article-title>Minimum sample size for external validation of a clinical prediction model with a binary outcome</article-title>
          <source>Stat Med</source>
          <year>2021</year>
          <month>08</month>
          <day>30</day>
          <volume>40</volume>
          <issue>19</issue>
          <fpage>4230</fpage>
          <lpage>4251</lpage>
          <pub-id pub-id-type="doi">10.1002/sim.9025</pub-id>
          <pub-id pub-id-type="medline">34031906</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="web">
          <article-title>Evaluate ML models at different hospitals</article-title>
          <source>GitHub</source>
          <access-date>2022-05-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/patriciacs1994/Evaluate-ML-models-at-different-hospitals">https://github.com/patriciacs1994/Evaluate-ML-models-at-different-hospitals</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
