<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v27i1e71297</article-id><article-id pub-id-type="doi">10.2196/71297</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Privacy-by-Design Approach to Generate Two Virtual Clinical Trials for Multiple Sclerosis and Release Them as Open Datasets: Evaluation Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Demuth</surname><given-names>Stanislas</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Rousseau</surname><given-names>Olivia</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Faddeenkov</surname><given-names>Igor</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Paris</surname><given-names>Julien</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>De S&#x00E8;ze</surname><given-names>J&#x00E9;r&#x00F4;me</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Baciotti</surname><given-names>B&#x00E9;atrice</given-names></name><degrees>MIM</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Payet</surname><given-names>Marianne</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Guillaudeux</surname><given-names>Morgan</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Barreteau</surname><given-names>Alban-F&#x00E9;lix</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Laplaud</surname><given-names>David</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Edan</surname><given-names>Gilles</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff8">8</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Gourraud</surname><given-names>Pierre-Antoine</given-names></name><degrees>MPH, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff9">9</xref></contrib></contrib-group><aff id="aff1"><institution>Center for Research in 
Transplantation and Translational Immunology, Institut national de la sant&#x00E9; et de la recherche m&#x00E9;dicale (INSERM), Nantes Universit&#x00E9;</institution><addr-line>30 boulevard Jean Monnet</addr-line><addr-line>Nantes</addr-line><country>France</country></aff><aff id="aff2"><institution>Institut national de la sant&#x00E9; et de la recherche m&#x00E9;dicale (INSERM) 1434, Clinical investigation center, University Hospital of Strasbourg</institution><addr-line>Strasbourg</addr-line><country>France</country></aff><aff id="aff3"><institution>Department of Neurology, University Hospital of Strasbourg</institution><addr-line>Strasbourg</addr-line><country>France</country></aff><aff id="aff4"><institution>Biogen France S.A.S</institution><addr-line>Paris</addr-line><country>France</country></aff><aff id="aff5"><institution>Neurology, Merck Sant&#x00E9; S.A.S., an affiliate of Merck KGaA</institution><addr-line>Lyon</addr-line><country>France</country></aff><aff id="aff6"><institution>Octopize, Mimethik Data</institution><addr-line>Nantes</addr-line><country>France</country></aff><aff id="aff7"><institution>Department of Neurology, University Hospital of Nantes</institution><addr-line>Nantes</addr-line><country>France</country></aff><aff id="aff8"><institution>Department of Neurology, University Hospital of Rennes</institution><addr-line>Rennes</addr-line><country>France</country></aff><aff id="aff9"><institution>Data Clinic, Department of Public Health, University Hospital of Nantes</institution><addr-line>Nantes</addr-line><country>France</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Cardoso</surname><given-names>Taiane de Azevedo</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Hashemi</surname><given-names>Atiye</given-names></name></contrib><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Reinoso</surname><given-names>Imanol Isasa</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Pierre-Antoine Gourraud, MPH, PhD, Center for Research in Transplantation and Translational Immunology, Institut national de la sant&#x00E9; et de la recherche m&#x00E9;dicale (INSERM), Nantes Universit&#x00E9;, 30 boulevard Jean Monnet, Nantes, 44093, France, 33 (0) 240087410; <email>pierre-antoine.gourraud@univ-nantes.fr</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>1</day><month>10</month><year>2025</year></pub-date><volume>27</volume><elocation-id>e71297</elocation-id><history><date date-type="received"><day>16</day><month>01</month><year>2025</year></date><date date-type="rev-recd"><day>22</day><month>06</month><year>2025</year></date><date date-type="accepted"><day>23</day><month>06</month><year>2025</year></date></history><copyright-statement>&#x00A9; Stanislas Demuth, Olivia Rousseau, Igor Faddeenkov, Julien Paris, J&#x00E9;r&#x00F4;me De S&#x00E8;ze, B&#x00E9;atrice Baciotti, Marianne Payet, Morgan Guillaudeux, Alban-F&#x00E9;lix Barreteau, David Laplaud, Gilles Edan, Pierre-Antoine Gourraud. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 1.10.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2025/1/e71297"/><abstract><sec><title>Background</title><p>Sharing information derived from individual patient data is restricted by regulatory frameworks due to privacy concerns. Generative artificial intelligence can generate shareable virtual patient populations as proxies for sensitive reference datasets. Explicit demonstration of privacy is demanded.</p></sec><sec><title>Objective</title><p>This study evaluated whether a privacy-by-design technique called &#x201C;avatars&#x201D; can generate synthetic datasets replicating all reported information from randomized clinical trials (RCTs).</p></sec><sec sec-type="methods"><title>Methods</title><p>We generated 2160 synthetic datasets from two phase 3 RCTs for patients with multiple sclerosis (NCT00213135 and NCT00906399; n=865 and 1516 patients) with different configurations to select one synthetic dataset with optimal privacy and utility for each. Several privacy metrics were computed, including protection against distance-based membership inference attacks. 
We assessed fidelity by comparing variable distributions and assessed utility by checking that all end points reported in the publications had the same effect directions, were within the reported 95% CIs, and had the same statistical significance.</p></sec><sec sec-type="results"><title>Results</title><p>Protection against membership inference attacks was the hardest privacy metric to optimize, but the technique yielded robust privacy and replication of the primary end points (in 72.5% and 80.8% of the 1080 generated datasets). Utility was uneven across the variables and end points, such that information about some end points could not be captured. With optimized generation configurations, we selected one dataset from each RCT replicating all efficacy end points of the placebo and approved treatment arms while maintaining satisfactory privacy (hidden rate: 85.0% and 93.2%).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Generating synthetic RCT datasets replicating primary and secondary efficacy end points is possible while achieving a satisfactory and explicit level of privacy. To show the potential of this method to unlock health data sharing, we released both placebo arms as open datasets.</p></sec></abstract><kwd-group><kwd>synthetic data</kwd><kwd>privacy</kwd><kwd>multiple sclerosis</kwd><kwd>anonymization</kwd><kwd>randomized clinical trial</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Medical practices are becoming increasingly data-driven, as empirical evidence is sought to inform all clinical decisions. While studies analyzing real-world data from electronic health records provide real-world evidence [<xref ref-type="bibr" rid="ref1">1</xref>], randomized clinical trial (RCT) data provide the highest level of evidence to guide medical practices, as this methodology approaches experimental settings. 
Within the standard clinical development pipeline of drugs, phase 3 RCTs are the largest-scale and most critical studies. Their primary end points provide regulatory evidence to approve new treatments on the market, while secondary end points and post hoc subgroup analyses, although not conclusive, provide high-quality information to generate hypotheses [<xref ref-type="bibr" rid="ref2">2</xref>]. RCT data are classically accessible through credentials on data-sharing platforms (eg, Vivli.org [<xref ref-type="bibr" rid="ref3">3</xref>], ClinicalStudyDataRequest.com [<xref ref-type="bibr" rid="ref4">4</xref>]) and analyzed in closed virtual work environments. Their accessibility is conditioned on a predefined analysis plan, which must be designed blindly. Individual patient data (IPD) from RCTs can be used for feasibility studies, for estimating the sample sizes necessary for RCTs, for indirect treatment comparisons [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>], as learning datasets for predictive model development, or as external control arms for clinical trials [<xref ref-type="bibr" rid="ref7">7</xref>]. As such, sharing RCT data as open datasets has been advocated by European regulators [<xref ref-type="bibr" rid="ref8">8</xref>], but the technical implementation standards for such policies are currently lacking.</p><p>The use and sharing of health data for clinical research are restricted by regulatory frameworks due to privacy concerns (eg, the General Data Protection Regulation in Europe and the Health Insurance Portability and Accountability Act in the United States). Privacy is commonly addressed by enforcing usage control through credentialed access and data deidentification (ie, removing direct identifiers), yielding pseudonymous datasets. However, this does not prevent indirect reidentification by unique combinations of variables [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. 
For the French data protection board (Commission Nationale de l&#x2019;informatique et des libert&#x00E9;s; CNIL), truly anonymous data must demonstrate the impossibility of linkage to the originating person [<xref ref-type="bibr" rid="ref11">11</xref>]. As conceptual guidance, 3 anonymization criteria have been postulated by the European Data Protection Board [<xref ref-type="bibr" rid="ref9">9</xref>] and integrated into the General Data Protection Regulation: (1) singling out (ie, unique identity disclosure), which is the capacity to reidentify a person from the uniqueness of records in a dataset; (2) linkability, which is the ability to link records concerning the same person across different databases; and (3) inference (ie, sensitive attribute disclosure), which is the possibility to deduce sensitive information about a person from the dataset.</p><p>Synthetic data are computationally generated individual observations created using a purpose-built mathematical model or algorithm [<xref ref-type="bibr" rid="ref12">12</xref>]. Their most disseminated use case is digital content creation (images or text) using generative artificial intelligence models such as generative adversarial networks (GANs) [<xref ref-type="bibr" rid="ref13">13</xref>] or large language models [<xref ref-type="bibr" rid="ref14">14</xref>]. In medicine, model-based generators typically rely on GANs or variational autoencoder architectures and are commonly used for data augmentation or privacy enhancement [<xref ref-type="bibr" rid="ref15">15</xref>]. The utility of synthetic datasets may be assessed using fidelity (ie, similarity) metrics and generator robustness [<xref ref-type="bibr" rid="ref16">16</xref>]. More specifically, analytical utility stems from the veracity of the information, assessed by replicating aggregated results. The model footprint reflects the complexity of the model [<xref ref-type="bibr" rid="ref17">17</xref>]. 
As synthetic datasets are computer-generated rather than collected from real persons, they are assumed to be anonymous by design. Thus, they appear as an alternative to share the information of sensitive datasets by representing it as a set of virtual patients instead of as the mathematical formula of a predictive model. However, there is concern about privacy leakage due to the individual granularity of synthetic datasets [<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref20">20</xref>]. Hence, there is a growing demand to explicitly assess privacy using quantitative metrics [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref21">21</xref>].</p></sec><sec id="s1-2"><title>State of the Art</title><p>The field of virtual RCTs originally aimed to simulate the effect of new treatments with individual-level modeling [<xref ref-type="bibr" rid="ref22">22</xref>]. This requires biomechanical models and has been achieved, for instance, in radiology in cross-sectional settings by the Virtual Imaging Clinical Trials for Regulatory Evaluation, which tested 2 mammography modalities on simulated images through a physics-based model of x-ray transmission and simulated breast cancer lesions [<xref ref-type="bibr" rid="ref23">23</xref>]. In multiple sclerosis (MS), an agent-based simulation of immune system activity, the MS TreatSim approach, has been proposed and could replicate the primary end point of the AFFIRM trial [<xref ref-type="bibr" rid="ref24">24</xref>]. Yet, drug development with biomechanistic modeling has not been achieved at the level of the whole organism.</p><p>The generation of virtual RCTs through statistical modeling aims to capture the information of reference datasets at the population level and then use the model generatively to yield synthetic IPD replicating the statistical behavior of reference IPD. 
The proposed use cases include providing technical stakeholders with mock data generated from metadata to explore standard data models such as CDISC (Clinical Data Interchange Standards Consortium) [<xref ref-type="bibr" rid="ref25">25</xref>]. Other works propose privacy enhancement for data sharing [<xref ref-type="bibr" rid="ref26">26</xref>], data augmentation to overcome insufficient patient accrual [<xref ref-type="bibr" rid="ref27">27</xref>], or &#x201C;synthetic control arms&#x201D; [<xref ref-type="bibr" rid="ref28">28</xref>]. However, this last term has mostly been used so far to designate external control arms of matched IPD from real-world data, which is closer to the field of clinical trials emulation than computer-generated data [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>].</p><p>Synthetic data generators typically take one modality of raw data (eg, images) or a single-table tabular dataset as a reference [<xref ref-type="bibr" rid="ref31">31</xref>]. Yet, health datasets have more complex data structures [<xref ref-type="bibr" rid="ref32">32</xref>]. Electronic health record&#x2013;derived data are longitudinal with a document data model, requiring time-series models [<xref ref-type="bibr" rid="ref33">33</xref>]. Graph autoencoders could generate multitable datasets by modeling patient trajectories as directed acyclic graphs [<xref ref-type="bibr" rid="ref34">34</xref>]. The standard follow-up of RCTs eases the representation of IPD as vectors to use classical statistical models. GANs adapted to tabular data or feature-based machine learning models have been used, with decision trees yielding the best performance [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref35">35</xref>]. 
Previous works focused primarily on oncology, assessed utility by the replicability of the primary end point, and did not assess privacy [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref28">28</xref>].</p><p>The replicability of the reference RCT may be assessed through the fidelity of the data point distributions and the analytical utility as the replication of the study results: estimate agreement, CI overlap, decision agreement, or standardized difference [<xref ref-type="bibr" rid="ref26">26</xref>]. The preservation of some predictive capacity is generally part of the utility assessment of synthetic data generated from real-world data [<xref ref-type="bibr" rid="ref36">36</xref>]. The generators developed from real-world data so far have claimed privacy through some privacy assessment at the model evaluation step. This assessment may rely on the risk of membership disclosure [<xref ref-type="bibr" rid="ref21">21</xref>] or the accuracy of an adversarial algorithm to discriminate real from synthetic data [<xref ref-type="bibr" rid="ref17">17</xref>]. The previous works generating synthetic RCT data have not assessed privacy.</p><p>A synthetic data generator called the &#x201C;avatars&#x201D; technique has recently been reported with a privacy-by-design approach [<xref ref-type="bibr" rid="ref37">37</xref>]. Unlike generative artificial intelligence models, it has been primarily designed as an anonymization technique with explicit privacy assessment. The initial report showed that synthetic datasets could be generated with high privacy metrics while outperforming Conditional Tabular Generative Adversarial Network (CT-GAN) [<xref ref-type="bibr" rid="ref38">38</xref>] and Synthpop [<xref ref-type="bibr" rid="ref39">39</xref>] in replicating the primary end point analyses of an RCT and a cohort study. 
However, to become effective proxies of sensitive IPD, synthetic data must demonstrate a wider utility than mere replication of the main analysis of a reference dataset.</p></sec><sec id="s1-3"><title>Objective</title><p>In this study, we generated 2 synthetic RCT datasets in MS from the CLARITY and ADVANCE phase 3 trials using the avatars technique. MS is the most frequent chronic autoimmune disease of the central nervous system, progressively impairing multiple neurological functions. The main course is marked by relapsing episodes of disabling symptoms, associated with the accumulation of demyelinating lesions assessed by T2-weighted magnetic resonance imaging (MRI) and gadolinium enhancement. The classical efficacy end points of RCTs evaluating disease-modifying treatments are the annualized relapse rate (ARR), rate of T2 and gadolinium-enhancing (GdE) lesions, and confirmed disability worsening (CDW). The 3- or 6-month confirmation of the latter aims to rule out reversible relapse-associated symptoms. MS activity can be decreased by treatments commonly referred to as &#x201C;disease-modifying treatments.&#x201D;</p><p>Here, we determined to what extent this privacy-by-design technique can generate anonymous virtual patient datasets that capture most of the information reported in RCT publications, including primary and secondary efficacy end points, as well as safety. 
This work enabled the release of the placebo arms of both synthetic datasets as open data with approval of the relevant stakeholders, thus demonstrating the potential of synthetic data for information sharing in medicine.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Reference Datasets</title><p>We used two independent phase 3 RCTs in MS as reference datasets: CLARITY from Merck (NCT00213135) [<xref ref-type="bibr" rid="ref40">40</xref>] and ADVANCE from Biogen (NCT00906399) [<xref ref-type="bibr" rid="ref41">41</xref>] (<xref ref-type="fig" rid="figure1">Figure 1</xref>). These trials were large-scale international studies whose primary end points provided regulatory evidence to approve 2 disease-modifying treatments for MS on the market: cladribine and peginterferon beta (Peg-IFN&#x03B2;), respectively. CLARITY enrolled 1326 patients to test 2 regimens of cladribine versus placebo, and ADVANCE enrolled 1516 patients to test 2 regimens of Peg-IFN&#x03B2; versus placebo. Both studies included patients without disease-modifying treatment for at least 3 months and lasted 2 years. The data were transferred after privacy-enhancement processes by both companies. For each RCT, we integrated the data into a single analysis-ready table (<xref ref-type="fig" rid="figure1">Figure 1</xref>). The variables were selected to replicate the graphical elements reported in the publications (ie, tables and flowcharts) as much as the transferred data enabled us to do. The primary efficacy end points that yielded the overall conclusion of the studies were the relapse activity. The secondary efficacy end points were the T2 and GdE MRI activity and CDW. Efficacy and safety data regarding adverse events (AEs) were available for 2 of the 3 arms of CLARITY: the placebo and the approved regimen (865 patients). 
We used CLARITY to assess whether synthetic datasets could capture the information on both efficacy and safety end points in the case of a classical parallel 2-arm design. We used ADVANCE to test the robustness of the technique for more complex study designs because the 3 arms were available, and patients in the placebo arm were rerandomized after 1 year to one of the 2 Peg-IFN&#x03B2; regimens for the second year. However, only efficacy data were available.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Reference datasets and pipeline of synthetic dataset generation and assessment. (A) Reference datasets were partially transferred as multiple tables and (B) the pipeline integrated the reference data into a single analysis-ready table for each RCT. To respect constraints between some variables, those with deterministic relations were removed, yielding minimal datasets for synthetic data generation. Several synthetic datasets were then generated with various parameter configurations. For every generated dataset, fidelity was assessed by comparing the minimal versions of reference and synthetic datasets, and utility was assessed by replicating the RCT analysis on the postprocessed versions. One dataset per RCT was selected based on the best privacy-utility trade-off, as described in the main text. PEG: pegylated; RCT: randomized clinical trial.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e71297_fig01.png"/></fig></sec><sec id="s2-2"><title>Synthetic Data Generation</title><p>The avatars technique was described in detail in its initial report [<xref ref-type="bibr" rid="ref37">37</xref>]. Briefly, it generates synthetic data points using a multidimensional reduction and nearest neighbors algorithm. 
For each reference data point, the algorithm creates a local probability density model based on the topography of the nearest neighbors in the latent space of a factor analysis of mixed data (FAMD). A synthetic data point, called an &#x201C;avatar,&#x201D; is randomly sampled from the local model. In addition to standard privacy metrics, the 1:1 linkage of each avatar with its reference data point enables the assessment of the protection against membership inference attacks. The technique is proprietary and implemented in a client-server architecture (Octopize Mimethik). To help the technique respect the constraints between variables, we discarded variables with deterministic relations from the integrated analysis-ready table (eg, sum of 2 variables; <xref ref-type="fig" rid="figure1">Figure 1</xref>). The minimal dataset of CLARITY had 864 individual observations and 35 variables (7 categorical and 28 quantitative; Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), and the minimal dataset of ADVANCE had 1512 individual observations and 25 variables (8 categorical and 17 quantitative; Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Each observation yields a reference data point. For quantitative variables, missing values were handled by default as &#x201C;missing at random.&#x201D; For categorical variables, we handled them as &#x201C;missing not at random,&#x201D; because they were related to study design and patient disposition. The Avatars server automatically imputes missing values with a k-nearest neighbors algorithm. We used the Python (version 0.7.2; Python Software Foundation) client of the avatars. 
Analogous to hyperparameter tuning in predictive model development, we tested different values of the following parameters to identify the configuration yielding the best compromise between privacy and utility:</p><list list-type="order"><list-item><p>k: the number of neighbors to create the local probabilistic model,</p></list-item><list-item><p>ncp: the number of projection components to compute the Euclidean distances of the neighbors, and</p></list-item><list-item><p>variable weights to favor a subset of variables during multidimensional reduction.</p></list-item></list><p>The tested values for k were 2, 5, 10, 15, 20, 25, 30, 40, 50, 75, 100, and 150, and the values tested for ncp were 5, 10, 20, 30, 46, and the maximum possible value. In our use case, ncp could be set up to 61 for CLARITY and up to 65 for ADVANCE, which is higher than the number of variables in the minimal datasets since categorical variables are automatically one-hot encoded by the Avatars server. The weighting of the variables was explored by preliminary generations to identify 2 relevant configurations per RCT in addition to the unweighted configuration. Alternative encodings of some variables were also tested, such as the encoding of relapse counts as categories (0, 1, 2, and 3 or more) and AEs count as Booleans (none vs any), and the handling of missing quantitative values as aberrant negative values instead of leaving them to be imputed by the Avatars server. Five synthetic datasets per configuration were generated with different random states for sampling avatars from the local probability density models. We used this random state as another hyperparameter. All generated datasets were analyzed separately (ie, no pooling). 
Finally, we removed patient identifiers and shuffled the rows of the selected synthetic datasets before release.</p></sec><sec id="s2-3"><title>Fidelity Assessment</title><p>Fidelity assessed the similarity of the synthetic dataset to the reference dataset regardless of its intended use: the similarity of univariate, bivariate, and multivariate distributions. All analyses were performed in R (version 4.2.3; The R Foundation). For univariate distributions, the Avatars server returned the mean of the Hellinger distances at the dataset level. Bivariate distributions of numeric variables were analyzed with the matrices of Pearson correlation coefficients returned by the Avatars server. Multivariate distributions were compared based on unweighted FAMD maps using the FactoMineR package (version 2.9 [<xref ref-type="bibr" rid="ref42">42</xref>]) after multiple imputations with the MICE package (version 3.16.0 [<xref ref-type="bibr" rid="ref43">43</xref>]). Weighted FAMD maps were also returned by the Avatars server using a dedicated Python algorithm called SAIPH, developed by the software editor (Octopize Mimethik [<xref ref-type="bibr" rid="ref44">44</xref>]).</p></sec><sec id="s2-4"><title>Utility Assessment</title><p>In this study, the utility assessed the similarity of the results obtained when replicating the analysis of interest on the synthetic dataset compared to those reported in the publications. For CLARITY, we also tested the replication of some post hoc subgroup analyses that proved critical for the market approval of cladribine [<xref ref-type="bibr" rid="ref45">45</xref>]. We used R base functions, MASS [<xref ref-type="bibr" rid="ref46">46</xref>], and the Survival packages (versions 7.3&#x2010;60 and 3.5&#x2010;7; Terry M Therneau) to replicate the statistical analysis based on the reported methods in the publications. 
For all end points, we considered the analysis to be replicated if (1) the estimate inferred from the synthetic dataset was within the 95% CIs reported in the publication, (2) the direction of the statistical effect was the same, and (3) the conclusion of the statistical test was the same (ie, whether the <italic>P</italic> value was &#x003C;.05 or not). We estimated the 95% CIs of adjusted ARRs by nonparametric bootstrap with 1000 replications, using the Boot package (version 1.3&#x2010;28.1 [<xref ref-type="bibr" rid="ref47">47</xref>]).</p></sec><sec id="s2-5"><title>Predictive Capacity</title><p>We also assessed the utility of the synthetic datasets for an alternative downstream task: the binary classification of patients who will have some MS activity during the study or not, either as clinical relapses or new MRI lesions. This predictive analysis included only complete cases. For ADVANCE, we used 1-year end points as targets because of the rerandomization of treatments for the placebo arm. Using the scikit-learn Python library (version 1.6.1 [<xref ref-type="bibr" rid="ref48">48</xref>]), we trained and evaluated a random forest binary classifier for each end point (train-test split of 70%&#x2010;30%). We designated the &#x201C;reference model&#x201D; as the one developed on the reference dataset and the &#x201C;test model&#x201D; as the one developed on synthetic datasets (default or optimized configurations). We assessed the predictive performance through the area under the receiver operating characteristic (ROC) curve and the accuracy. Their 95% CIs were estimated through bootstrapping (1000 resamplings). We also evaluated the generalizability of the performances of the reference and test models on the other dataset version, namely the synthetic and reference datasets, respectively.</p></sec><sec id="s2-6"><title>Privacy Assessment</title><p>Privacy was assessed by the privacy metrics returned by the Avatars server. 
They are defined briefly in <xref ref-type="table" rid="table1">Table 1</xref> and in detail on Octopize&#x2019;s website [<xref ref-type="bibr" rid="ref49">49</xref>]. The hidden rate (HR) is specific to the avatars technique and measures the risk of membership inference attacks [<xref ref-type="bibr" rid="ref21">21</xref>]. It is computed from the local cloaking (LC), whose development has been detailed in the report of the avatar technique [<xref ref-type="bibr" rid="ref37">37</xref>]. Briefly, for each patient, the LC counts the number of avatars that are more similar to his or her reference data point than his or her own avatar. An LC &#x2265;1 means that a distance-based matching would be erroneous for this patient. This scenario is extreme because the attacker would need to know all the variables of the patient. In our case, the scenario would be that an attacker with access to the synthetic dataset attempts to assess whether the patient was enrolled in the RCT and thus infer his or her diagnosis of MS. At the dataset level, privacy is summarized by the median LC and the HR, which is the proportion of patients with an LC of &#x2265;1. The software editor provides indicative targets for each metric (<xref ref-type="table" rid="table1">Table 1</xref>). In this study, we considered a median LC of 2 and an HR above 80% to be satisfactory.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Privacy metrics of the selected datasets generated with optimized parameters. Metrics are grouped according to the conceptual anonymization criteria postulated by the European Data Protection Board. Detailed metric definitions are available on the software editor&#x2019;s website. 
All distances are Euclidean.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Anonymization criteria and metric</td><td align="left" valign="bottom">Definition</td><td align="left" valign="bottom">Software editor recommendation (indicative)</td><td align="left" valign="bottom">CLARITY<break/>(optimized parameter)</td><td align="left" valign="bottom">ADVANCE<break/>(optimized parameter)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">Singling out</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Distance to the closest</td><td align="left" valign="top">Median distance between each synthetic data point and its closest reference data point</td><td align="char" char="." valign="top">&#x003E;0.2</td><td align="char" char="." valign="top">0.31</td><td align="char" char="." valign="top">0.30</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Distance to the closest ratio</td><td align="left" valign="top">Median of the ratio of distances between each synthetic data point and its closest and second-closest reference data points</td><td align="left" valign="top">&#x003E;0.3</td><td align="left" valign="top">0.81</td><td align="left" valign="top">0.60</td></tr><tr><td align="left" valign="top" colspan="5">Linkability, %</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Column direct match protection</td><td align="left" valign="top">Minimum probability that a variable could be used as a direct identifier</td><td align="left" valign="top">&#x003E;50</td><td align="left" valign="top">84.8</td><td align="left" valign="top">90.9</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Row direct match 
protection</td><td align="left" valign="top">Percentage of synthetic data points that are not identical to reference data points</td><td align="left" valign="top">&#x003E;90</td><td align="left" valign="top">100</td><td align="left" valign="top">100</td></tr><tr><td align="left" valign="top" colspan="5">Inference</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Median local cloaking</td><td align="left" valign="top">Median number of avatars more similar to the reference data point of a patient than its own avatar</td><td align="char" char="." valign="top">&#x003E;5</td><td align="char" char="." valign="top">3</td><td align="char" char="." valign="top">6</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Hidden rate, %</td><td align="left" valign="top">Probability of erroneous distance-based matching</td><td align="left" valign="top">&#x003E;90</td><td align="left" valign="top">85.0</td><td align="left" valign="top">93.2</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Categorical hidden rate, %</td><td align="left" valign="top">Probability of erroneous distance-based matching based on categorical variables only</td><td align="left" valign="top">&#x003E;90</td><td align="left" valign="top">98.4</td><td align="left" valign="top">98.0</td></tr></tbody></table></table-wrap></sec><sec id="s2-7"><title>Dataset Selection</title><p>From both RCTs, we selected the synthetic dataset that replicated the primary and secondary efficacy end points best while having a satisfactory level of privacy. The utilities of the datasets replicating all reported statistical test conclusions were inspected individually. In cases of equivalent utilities, the dataset with the highest privacy was preferred. 
In cases where no dataset replicated all the end points, the replications of the noncommercial arm end points were neglected. If still insufficient, the replication of the primary absolute and relative end points (ie, the relapse activity) was prioritized, followed by T2 MRI activity, CDW, and finally GdE MRI activity, with priority given to relative over absolute secondary end points.</p></sec><sec id="s2-8"><title>Data and Code Availability</title><p>The reference datasets may be shared upon request from Merck and Biogen. The placebo arms of the 2 selected synthetic datasets have been made publicly available as open access on the Figshare platform [<xref ref-type="bibr" rid="ref50">50</xref>] with the approval of Merck and Biogen, although these approvals were not strictly necessary from a regulatory point of view. The code is available as R and Python notebooks at GitLab [<xref ref-type="bibr" rid="ref51">51</xref>]. Multitable simulated versions have been rebuilt according to the original CDISC formats for educational purposes.</p></sec><sec id="s2-9"><title>Ethical Considerations</title><p>The research was conducted under the consortium agreement of the ANR-21-RHUS-0014 PRIMUS project and the MR004 data processing regulation framework of the French Personal Data Regulatory Commission (Commission nationale de l&#x2019;informatique et des libert&#x00E9;s; CNIL). It was approved by the institutional review board of Nantes University (reference 09072024). According to French law, this study was covered by the written consent for the primary studies and the information for further research use. 
The deidentified datasets were transferred after privacy-enhancement processes by Merck and Biogen.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Robust Utility for the Primary End Points</title><p>We generated 2160 synthetic datasets with varying parameter configurations, half using CLARITY and half using ADVANCE as reference datasets (<xref ref-type="fig" rid="figure1">Figure 1</xref>). Despite the complexity of the ADVANCE study design, only a few individual observations had to be postprocessed in some datasets for the study design to remain consistent. The missing data patterns due to attrition were well replicated, although the number of patients per arm was not necessarily as balanced as after true randomization (Figures S4 and S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The primary end point estimates were robustly replicated across the different configurations (<xref ref-type="fig" rid="figure2">Figure 2</xref>). The estimate of CLARITY was within the reported 95% CI in 783 of the 1080 datasets (72.5%), always with significant <italic>P</italic> values. The estimate of ADVANCE was within the reported 95% CI in 876 of the 1080 datasets (81.1%), with 873 (80.8%) of them having a significant <italic>P</italic> value.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Robustness of the primary end point replications and privacy. Each point represents a generated synthetic dataset (1080 datasets per RCT). Privacy is expressed by the hidden rate, which reflects the probability of failure of a distance-based membership inference attack. The reported estimations of the primary end points are plotted with their 95% CIs (horizontal lines and gray areas). Among the 1080 generated datasets, 813 (75.3%) were within the reported 95% CI for CLARITY and 871 (80.6%) for ADVANCE. 
Higher privacy tended to lower the inferred treatment effect, likely reflecting the loss of statistical signal between the trial arms. The 2 selected datasets with optimized parameters are highlighted, as are the 2 generated with default configurations. AE: adverse event; CDW: confirmed disability worsening; ncp: number of principal components; Peg-IFN&#x03B2;/2w: peginterferon beta 1 dose every 2 weeks; RCT: randomized clinical trial.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e71297_fig02.png"/></fig></sec><sec id="s3-2"><title>Robust Privacy</title><p>Most of the 2160 generated datasets had privacy metrics passing the software editor&#x2019;s recommendations (<xref ref-type="fig" rid="figure3">Figure 3</xref>; <xref ref-type="table" rid="table1">Tables 1</xref> and <xref ref-type="table" rid="table2">2</xref> for the numerical values). Only 4 had one avatar that was, by chance, identical to a reference data point (ie, row direct match). The distance of the avatars to the closest reference data point assesses the dispersion of the synthetic data points relative to the set of reference data points: the higher, the better the privacy. It was above 0.2, the threshold recommended by the avatars software editor, for all of the generated datasets (<xref ref-type="table" rid="table1">Table 1</xref>). The HR, the categorical HR, and the mean of Hellinger distances were the metrics for which the recommended thresholds were most difficult to pass. We focused the rest of the report on HR. All 2160 generations had an HR above 80% (<xref ref-type="fig" rid="figure2">Figure 2</xref>). The HR increased in the postprocessed datasets whose privacy was assessed with the default encoding of all variables and unweighted FAMD projections (not shown). 
Overall, this shows the robustness of the avatars technique regarding privacy.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Privacy and fidelity metrics distributions of all generated datasets (1080 per randomized clinical trial). The boxes show the quartiles and the median of the values (whiskers represent quartiles &#x00B1;1.5&#x00D7;IQR). The recommended thresholds by the software editor are plotted as red lines. Privacy metrics were expected to be above the threshold (first 7 metrics), and fidelity metrics were expected to be below the threshold (last 2 metrics). For readability, we scaled the values of some metrics and plotted those expressed in percentages as proportions. The median local cloaking is capped by the software above 50. The hidden rate, the categorical hidden rate, and mean of Hellinger distances were the metrics most difficult to pass relative to the recommended thresholds. RCT: randomized clinical trial.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e71297_fig03.png"/></fig><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Fidelity metrics of the selected datasets generated with optimized parameters.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Fidelity metric</td><td align="left" valign="bottom">Definition</td><td align="left" valign="bottom">Recommended target by Octopize (indicative)</td><td align="left" valign="bottom">CLARITY<break/>(optimized parameters)</td><td align="left" valign="bottom">ADVANCE<break/>(optimized parameters)</td></tr></thead><tbody><tr><td align="left" valign="top">Mean of Hellinger distances</td><td align="left" valign="top">Mean of the Hellinger distances of each variable</td><td align="left" valign="top">&#x003C;0.10</td><td align="left" valign="top">0.10</td><td align="left" valign="top">0.09</td></tr><tr><td align="left" 
valign="top">Correlation difference ratio, %</td><td align="left" valign="top">Average of the absolute variations of Pearson correlations</td><td align="left" valign="top">&#x003C;10</td><td align="left" valign="top">2.52</td><td align="left" valign="top">1.49</td></tr></tbody></table></table-wrap></sec><sec id="s3-3"><title>Synthetic Dataset Selection in the Context of a Privacy-Fidelity Trade-Off</title><p>The assessment of privacy, fidelity, and utility showed a privacy-fidelity trade-off (<xref ref-type="fig" rid="figure4">Figure 4</xref>). We assessed fidelity with the mean of the Hellinger distances between the univariate distributions. Small k values increased fidelity while decreasing privacy. A small ncp value increased fidelity with few effects on privacy. Weighting and encoding some variables differently could optimize the trade-off, as reflected by the generation of datasets closer to the &#x201C;sweet spot&#x201D; with both high fidelity and privacy. A better fidelity did not automatically improve utility. For CLARITY, four datasets (0.4%) replicated all primary and secondary efficacy end points. For ADVANCE, no dataset replicated all primary and secondary efficacy end points for the 2 tested regimens, but 14 did when neglecting the noncommercial regimen (1.3%). For CLARITY, we selected the dataset with the best replication of absolute estimates, generated with k=5, ncp=5, weighting of the study arm by 20, and encoding of relapse counts as categories (0, 1, 2, and 3 or more) and AE counts as Booleans (none vs any). Such encoding was reverted at postprocessing before replicating the RCT analysis, but yielded some granularity loss. For ADVANCE, we selected the dataset generated with k=2, ncp=10, weighting of the study arm by 20, relapse counts and CDW delays by 2, and missing quantitative values encoded as aberrant negative values. 
The selected dataset from CLARITY had a median LC of 3 and an HR of 85.0%; the one from ADVANCE had a median LC of 6 and an HR of 93.2% (<xref ref-type="table" rid="table1">Table 1</xref>). We focus the rest of the report on both selected datasets (referred to as &#x201C;optimized&#x201D;) and 2 datasets generated with default parameters (k=10; ncp=10; not weighted) and the third random state.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Privacy-fidelity trade-off. Each point represents the average metrics of the 5 generations with a given parameter configuration but different random states (216 groups per RCT). Privacy is expressed by the hidden rate, assessing the probability of failure of a distance-based membership inference attack. Fidelity is expressed as the mean of the Hellinger distances between the univariate distributions. Weighting and encoding some variables differently could optimize the trade-off, as reflected by the generation of datasets closer to the &#x201C;sweet spot&#x201D; in the upper right corner (high privacy and high fidelity). The 2 selected datasets are highlighted, as are the 2 generated with default configurations. Greater fidelity did not automatically improve utility, as reflected by the position of the selected datasets. AE: adverse event; CDW: confirmed disability worsening; ncp: number of principal components; RCT: randomized clinical trial.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e71297_fig04.png"/></fig></sec><sec id="s3-4"><title>Good Fidelity at the Population Level Despite Alterations in Variable Distributions</title><p>The mean of Hellinger distances was 0.10 and 0.09 for the selected datasets from CLARITY and ADVANCE, respectively (<xref ref-type="table" rid="table2">Table 2</xref>). 
The effects of the avatar method on variable distributions were consistent across all generated datasets, modulated only by different parameter configurations (<xref ref-type="fig" rid="figure5">Figure 5</xref>). The distributions of categorical variables were the most preserved, with a tendency to amplify class imbalances. The distributions of quantitative variables tended to be narrowed and normalized, but their means were similar if they had a limited skewness. Of note, many distributions, especially MRI lesion counts, were skewed, with 0 being the majority value and many outliers on the right tail. As a result of the privacy-by-design approach, the avatars of the outliers were drastically recentered toward high-density regions in the synthetic dataset, as shown by the weighted FAMD projections (<xref ref-type="fig" rid="figure6">Figure 6</xref>), which tended to decrease the average absolute counts. The most affected variable was the count of GdE lesions at 2 years in ADVANCE. Its average was reduced by about a factor of 3 in the default dataset (0.47 to 0.14), which could be mitigated with the optimized configuration. Bivariate distributions were similar (Figure S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), as were the missing data patterns (Figure S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Impairment of univariate distributions. Comparisons of illustrative univariate distributions for the default and selected datasets generated from ADVANCE (top and bottom panels, respectively). Means are plotted as dashed lines. The avatars technique altered the distributions to varying degrees depending on the type of variable. The largest effects were observed for count distributions. 
CDW: confirmed disability worsening; FAMD: factor analysis of mixed data; GdE: gadolinium-enhancing; N/E: new or enlarging.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e71297_fig05.png"/></fig><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Comparison of the factor analysis of mixed data projections of the selected datasets with or without applying the weights of the respective parameter configuration. As a result of the primary design of the avatars as an anonymization technique, the avatar data points clustered in higher-density regions, which makes them less reidentifiable. FAMD: factor analysis of mixed data.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e71297_fig06.png"/></fig></sec><sec id="s3-5"><title>The Utility for Multiple End Points Needs Optimization</title><p>While most generations replicated the primary end point of the respective RCT, replicating all secondary end points was more challenging (<xref ref-type="fig" rid="figure7">Figures 7</xref> and <xref ref-type="fig" rid="figure8">8</xref>). Generations with default parameters replicated most relative end points but tended to shift absolute end points due to the amplification of class imbalance by the avatars technique, increasing the percentages of the most represented classes and decreasing those of the minority classes. ARR and lesion rates were highly sensitive to the average shift of count variables. This limitation could be mitigated by optimizing the parameters, especially the weighting and encoding of some variables. 
The replications of the flowcharts and tables of both RCT reports are presented in Figures S4 and S5, Tables S3-S5 and S6&#x2010;S7 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><fig position="float" id="figure7"><label>Figure 7.</label><caption><p>Utility assessment of the default and selected datasets from CLARITY. All end points were analyzed over 2 years and against placebo. The analyses were adjusted for covariates as reported. The <italic>P</italic> values of the subgroup analysis end points correspond to the interaction tests. Only the most important types of adverse events are displayed. The optimized configuration mitigated the limitations of the avatars technique observed with the default configuration, especially for the replication of absolute rates and safety outcomes. AE: adverse event; ARR: annualized relapse rate; CDW: confirmed disability worsening; CUA: combined unique active; GdE: gadolinium-enhancing; HR: hazard ratio; HRA: high relapse activity (ie, 2 relapses or more during the year preceding the study baseline); OR: odds ratio; SAE: serious adverse event.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e71297_fig07.png"/></fig><fig position="float" id="figure8"><label>Figure 8.</label><caption><p>Utility assessment of the default and selected datasets from ADVANCE. All end points were analyzed over 2 years and against placebo unless specified (eg, year 1 and year 2). The analyses were adjusted for covariates as reported. All primary end points and all relative secondary end points were successfully replicated for the commercial &#x201C;1 dose per 2 weeks&#x201D; regimen. The optimized configuration mitigated the limitations of the avatars technique observed with the default configuration, especially for replication of absolute rates. 
ARR: annualized relapse rate; CDW: confirmed disability worsening; GdE: gadolinium-enhancing; HR: hazard ratio; N/E T2 lesions: new or enlarging T2 lesions; Peg-IFNb/2w: peginterferon beta-1 dose every 2 weeks; Peg-IFNb/4w: peginterferon beta-1 dose every 4 weeks.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e71297_fig08.png"/></fig><p>For CLARITY (<xref ref-type="fig" rid="figure7">Figure 7</xref>), we pushed the assessment of specific utility up to the replication of interaction tests in a post hoc subgroup analysis in patients with high relapse activity (ie, 2 or more relapses the year before the study baseline) [<xref ref-type="bibr" rid="ref45">45</xref>]. The alteration in univariate distributions by the avatars method suggested that subgroup analyses would be harder to replicate, but the selected dataset managed to do so. These post hoc subgroup analyses were critical for the market approval of cladribine in this subpopulation, as the initial submission for the whole relapsing-remitting MS population had been withdrawn due to safety concerns about the risk of neoplasm (6 vs 0 patients in the real dataset). The safety end points were very sensitive to the skewness of count distributions, such that the proportions of patients with serious AEs were drastically reduced in the default dataset. Encoding AEs as Booleans mitigated this and also replicated the contrast of neoplasm incidence (5 avatars with cladribine vs 0 with placebo). The replication of the RCT report tables is provided in Tables S3-S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>For ADVANCE (<xref ref-type="fig" rid="figure8">Figure 8</xref>), the complex design aimed to compare MS activity during the second year against the first year of treatment to assess the run-in (ie, delay of action) of Peg-IFN&#x03B2;. 
Indeed, the selected dataset and the one generated with default parameters replicated the decrease of the ARR during year 2 with the &#x201C;1 dose per 2 weeks&#x201D; regimen, while only the optimized dataset replicated the stability of the ARR with the &#x201C;1 dose per 4 weeks&#x201D; regimen. In the selected dataset, the only end points that could not be replicated were the 12-week CDW hazard ratio estimate between both tested regimens and the 24-week CDW hazard ratio estimate for the noncommercial regimen. The first was outside the reported 95% CI with a <italic>P</italic> value that became significant, while the second was in the wrong direction. The replicability of the absolute GdE lesion count was poor, whatever the configuration. This limitation was likely associated with the skewness of this variable distribution, which was essentially composed of outliers (<xref ref-type="fig" rid="figure5">Figure 5</xref>). The replication of the RCT report tables is provided in Tables S6 and S7 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s3-6"><title>The Utility for Other Downstream Tasks Is Not Guaranteed</title><p>Performing prediction tasks on synthetic data yielded better performances than when performed on reference data (<xref ref-type="table" rid="table3">Tables 3</xref> and <xref ref-type="table" rid="table4">4</xref>). The higher the performance of the reference model, the more important the increase in performance of the test model. This indicated a simplification of the data patterns in the synthetic datasets, which is consistent with the normalization of univariate distributions and the decrease of outliers (<xref ref-type="fig" rid="figure5">Figure 5</xref>). This interpretation was reinforced by the similar or better performances of the reference models when evaluated on the synthetic datasets. 
Likewise, we controlled for overfitting on unrealistic patterns in the synthetic datasets by evaluating the test models on the reference datasets. The test models had similar or better performances on the reference datasets than the reference models, which suggested a regularizing effect of the synthetic data. Yet, the difference in prediction performances could also result from the amplification of class imbalance. Overall, these findings necessitate that the utility assessment of synthetic datasets prioritize the end points, as their replicabilities are uneven and may be conditioned by the characteristics of the reference dataset. The synthetic data generation may be optimized toward a given purpose by weighting some variables or encoding them differently.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Predictive capacity of the datasets generated from CLARITY.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Experiment and metric</td><td align="left" valign="bottom">Reference data, estimate (95% CI)</td><td align="left" valign="bottom">Synthetic data (default), estimate (95% CI)</td><td align="left" valign="bottom">Synthetic data (optimized), estimate (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="4">Reference model on reference data and test model on synthetic data</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Relapse activity over 2 years</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>AUC<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="top">0.57 (0.48-0.66)</td><td align="left" 
valign="top">0.76 (0.70-0.82)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">0.75 (0.67-0.82)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Accuracy</td><td align="left" valign="top">0.74 (0.69-0.80)</td><td align="left" valign="top">0.77 (0.72-0.82)</td><td align="left" valign="top">0.75 (0.69-0.80)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>MRI<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup> activity over 2 years</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>AUC</td><td align="left" valign="top">0.76 (0.69-0.82)</td><td align="left" valign="top">0.80 (0.73-0.87)</td><td align="left" valign="top">0.89 (0.84-0.93)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Accuracy</td><td align="left" valign="top">0.75 (0.69-0.80)</td><td align="left" valign="top">0.82 (0.77-0.86)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">0.87 (0.83-0.91)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td></tr><tr><td align="left" valign="top" colspan="4">Reference model on synthetic data</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Relapse activity over 2 years</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>AUC</td><td align="char" char="." valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td><td align="char" char="." valign="top">0.76 (0.69-0.83)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="char" char="." valign="top">0.71 (0.63-0.78)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Accuracy</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.76 (0.71-0.82)</td><td align="left" valign="top">0.73 (0.67-0.78)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>MRI activity over 2 years</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>AUC</td><td align="left" valign="top">&#x2014;</td><td align="char" char="." valign="top">0.88 (0.82-0.93)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="char" char="." 
valign="top">0.88 (0.82-0.93)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Accuracy</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.85 (0.81-0.89)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">0.83 (0.78-0.87)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td></tr><tr><td align="left" valign="top" colspan="4">Test model on reference data</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Relapse activity over 2 years</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>AUC</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.73 (0.66-0.81)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">0.70 (0.61-0.78)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Accuracy</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.73 (0.67-0.79)</td><td align="left" valign="top">0.76 (0.71-0.82)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>MRI activity over 2 years</td><td align="left" valign="top"/><td align="left" valign="top"/><td 
align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>AUC</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.78 (0.71-0.84)</td><td align="left" valign="top">0.84 (0.78-0.89)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Accuracy</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.77 (0.72-0.82)</td><td align="left" valign="top">0.81 (0.75-0.85)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>AUC: area under the receiver operating characteristic curve.</p></fn><fn id="table3fn2"><p><sup>b</sup>Values outside the 95% CI of the reference model performance.</p></fn><fn id="table3fn3"><p><sup>c</sup>MRI: magnetic resonance imaging.</p></fn><fn id="table3fn4"><p><sup>d</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Predictive capacity of the datasets generated from ADVANCE.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Experiment and metric</td><td align="left" valign="bottom">Reference data, estimate (95% CI)</td><td align="left" valign="bottom">Synthetic data (default), estimate (95% CI)</td><td align="left" valign="bottom">Synthetic data (optimized), estimate (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="4">Reference model on reference data and test model on synthetic data</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Relapse activity over year 1</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>AUC<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">0.60 (0.53-0.67)</td><td align="left" valign="top">0.78 (0.73-0.84)<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="top">0.77 (0.70-0.82)<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Accuracy</td><td align="left" valign="top">0.79 (0.75-0.83)</td><td align="left" valign="top">0.84 (0.81-0.88)<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="top">0.82 (0.79-0.86)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>MRI<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup> activity over year 1</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>AUC</td><td align="left" valign="top">0.78 (0.73-0.83)</td><td align="left" valign="top">0.87 (0.81-0.92)<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="top">0.89 (0.85-0.92)<sup><xref 
ref-type="table-fn" rid="table4fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Accuracy</td><td align="left" valign="top">0.79 (0.75-0.83)</td><td align="left" valign="top">0.90 (0.86-0.93)<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="top">0.86 (0.82-0.89)<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td></tr><tr><td align="left" valign="top" colspan="4">Reference model on synthetic data</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Relapse activity over year 1</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>AUC</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td><td align="left" valign="top">0.79 (0.74-0.84)<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="top">0.87 (0.83-0.91)<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Accuracy</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.80 (0.77-0.84)</td><td align="left" valign="top">0.83 (0.80-0.87)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>MRI activity over year 1</td><td align="left" valign="top"/><td 
align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>AUC</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.91 (0.88-0.95)<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="top">0.95 (0.93-0.97)<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Accuracy</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.90 (0.86-0.93)<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="top">0.88 (0.85-0.91)<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td></tr><tr><td align="left" valign="top" colspan="4">Test model on reference data</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Relapse activity over year 1</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>AUC</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.72 (0.65-0.78)<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="top">0.79 (0.73-0.84)<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Accuracy</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.77 (0.73-0.81)</td><td align="left" valign="top">0.82 (0.79-0.86)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>MRI activity over year 1</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>AUC</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.80 (0.75-0.85)</td><td align="left" valign="top">0.88 (0.84-0.92)<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Accuracy</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.79 (0.75-0.83)</td><td align="left" valign="top">0.85 (0.81-0.88)<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>AUC: area under the receiver operating characteristic curve.</p></fn><fn id="table4fn2"><p><sup>b</sup>Values outside the 95% CI of the reference model performance.</p></fn><fn id="table4fn3"><p><sup>c</sup>MRI: magnetic resonance imaging.</p></fn><fn id="table4fn4"><p><sup>d</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>While a report of the avatars technique already provided proof of concept that a synthetic dataset could reproduce the primary end point [<xref ref-type="bibr" 
rid="ref37">37</xref>], our study showed that it is possible to generate synthetic datasets replicating most absolute and relative end points reported in the publications while implementing the regulatory guidance about anonymization. The method proved robust for privacy and the replication of the primary end point, but finding a satisfactory utility required optimization. This optimization process is analogous to the development and selection of machine learning models after searching for the optimal algorithm family and hyperparameters. In our use case, the explicit privacy assessment allowed us to legally qualify the synthetic datasets as nonpersonal data and share them as open datasets. Satisfactory utility was even achieved with the complex study design of ADVANCE, which suggests the ability of the avatars technique to capture the information of a wide range of RCTs and complex datasets in other fields.</p></sec><sec id="s4-2"><title>Limitations</title><p>This study did not compare the avatars technique with benchmark algorithms. The first report of the avatars method performed such an analysis against Synthpop and CT-GAN and showed that the avatars outperformed them in replicating the primary end point of an RCT and a cohort of real-world data [<xref ref-type="bibr" rid="ref37">37</xref>]. Although that analysis was not performed on the same reference datasets, we considered this result sufficiently established to focus the efforts and the analysis of this study on the privacy and replication of the multiple end points an RCT may have.</p><p>Nevertheless, the full granularity of the reference datasets could not be captured, which would be a requirement to use the synthetic dataset as an external comparator. Such external comparison has been performed in the CHAMPION trial in neuromyelitis optica spectrum disorder, a rare and aggressive disease, to evaluate ravulizumab while avoiding exposing patients to a placebo [<xref ref-type="bibr" rid="ref7">7</xref>]. 
For educational purposes, we also provided a simulation of a rebuilt version of the synthetic datasets into the CDISC standards, as received by Merck and Biogen. Our results showed that encoding the reference data in a more aggregated fashion (relapse counts as 4-level categorical variables, AE counts as Booleans) improved the utility regarding the corresponding end points. This aggregation could have been pushed further at the cost of a narrowed intended use of the generated datasets. This and the partial data transfer by the industry partners limited the granularity that could be captured by the avatars technique.</p><p>Since the avatars technique has been primarily developed as an anonymization technique, it tends to recenter the data points in the latent space and alter the univariate distributions because minority profiles and outliers are easier to reidentify. This is likely to limit the use of the synthetic datasets for exploratory subgroup analysis in populations defined by several criteria or as external synthetic control arms, should significant subgroup matching with the real experimental arm be necessary. Furthermore, the fact that better fidelity did not automatically result in better utility highlights that the assessment of a synthetic dataset cannot be agnostic of the intended use. As such, post hoc analysis of synthetic datasets can only be hypothesis-generating.</p><p>As suggested by the alternative variable weighting and encodings, our results could be improved by using more complex data preprocessing (eg, normalizing count data with log transforms) and synthetic dataset generation (eg, one generation per study arm). 
In truth, the parameter space with alternative weighting, encoding of variables, and random states could not be explored exhaustively because of computational cost considerations.</p><p>The selected synthetic datasets had median LCs and HRs below the targets generally recommended by the software editor (<xref ref-type="table" rid="table1">Table 1</xref>). These targets are only indicative. No technical consensus exists about the required privacy metrics and their acceptable levels. In our specific use case, one has to take into account the combination of other privacy-enhancing processes such as deidentification, time shifting, exclusion of any medico-administrative variable to retain only specialized variables about MS (ie, data minimization), the aggregation of data into an integrated analysis-ready table, and the increase of HR after postprocessing. This suggests that the privacy-fidelity and privacy-utility trade-offs of synthetic data generation should be evaluated on a case-by-case basis.</p></sec><sec id="s4-3"><title>Perspectives</title><p>The privacy-fidelity trade-off highlighted by the 2160 datasets we generated (<xref ref-type="fig" rid="figure4">Figure 4</xref>) and the uneven utilities (<xref ref-type="fig" rid="figure7">Figures 7</xref> and <xref ref-type="fig" rid="figure8">8</xref>) are both a limit of the agnostic exploration of synthetic datasets and a perspective for usage control over the data value chain. Beyond the risk of patient reidentification from individual observations, the owner of a reference database may be concerned by the loss of control over the information of a dataset, should a synthetic dataset have a high and broad utility. According to the intended usage of the synthetic data, the generation may be parametrized or the dataset selected to favor utility or privacy and specific variables. 
As such, a dedicated study would be required to analyze the performance gain of the predictive models trained on synthetic data (<xref ref-type="table" rid="table3">Tables 3</xref> and <xref ref-type="table" rid="table4">4</xref>).</p><p>In contrast to the data-centric approach of synthetic data, the dominant trend in sharing information for medical research is to share calibrations of parametric models. Federated learning is the archetypal framework for developing deep learning models with sensitive data [<xref ref-type="bibr" rid="ref52">52</xref>]. Both approaches have been compared operationally, with significantly faster processes when sharing synthetic data [<xref ref-type="bibr" rid="ref53">53</xref>]. Still, even if the sensitive data are not shared, the privacy of the model learned from them remains questionable [<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref20">20</xref>]. Therefore, both approaches could supplement one another, with federated learning enabling data owners to enforce their control rules, while synthetic data would address the privacy risk and augment datasets for a given use or context.</p></sec><sec id="s4-4"><title>Conclusion</title><p>We generated synthetic RCT datasets and selected 2 for release as open datasets with a satisfactory trade-off between privacy and utility. To the best of our knowledge, this is the first report of virtual trials replicating all reported efficacy end points for the placebo and approved regimen arms of several RCTs. The synthetic datasets may be used for various exploratory uses, but the information captured is insufficient for a complete indirect treatment comparison. The privacy-fidelity trade-off and the uneven utility show that synthetic data generation has to be purpose-driven, rather than agnostic of the intended use. 
Besides the privacy enhancement of synthetic datasets, their limited validity for unintended uses provides usage control to the owner of the reference data.</p></sec></sec></body><back><ack><p>We thank Nathalie Blanc and Joelle Martin-Gauthier for their support in project management. The authors confirm that there was no use of generative artificial intelligence (AI) technology in the generation of text, figures, or other informational content of this manuscript. This work is part of the PRIMUS project, which was supported in part by the French National Research Agency (Agence Nationale de la Recherche, ANR) under the third Programme d&#x2019;Investissements d&#x2019;Avenir (PIA), integrated into the France 2030 plan under reference ANR-21-RHUS-0014. We thank Biogen and Merck for providing the reference datasets and Octopize for the technical support.</p></ack><fn-group><fn fn-type="con"><p>SD contributed to conceptualization, data curation, formal analysis, investigation, methodology, and writing of the original draft. OR contributed to methodology, validation, and writing, including review and editing. IF and JP contributed to writing, including review and editing. JDS provided supervision and contributed to writing, including review and editing. BB, MP, MG, and AFB contributed resources, with MG and AFB also providing software, and all four participated in writing, including review and editing. DL and GE were responsible for funding acquisition, supervision, validation, and writing, including review and editing. PAG contributed to conceptualization, funding acquisition, methodology, supervision, validation, and writing, including review and editing.</p></fn><fn fn-type="conflict"><p>SD, OR, IF, JP, GE, and DL have no conflicts of interest to disclose. BB is an employee of Biogen and may own stock in the company. She was neither involved in the conception of the work nor in the analysis of the results. MP is an employee of Merck. 
She was neither involved in the conception of the work nor in the analysis of the results. MG and AFB are employees of Octopize. They were neither involved in the conception of the work nor in the analysis of the results. JDS has participated in advisory boards for Biogen, Merck, and Juvis&#x00E9; Pharmaceuticals. DL has participated in advisory boards for Alexion, Merck, Novartis, and Roche in the last 3 years. PAG is the founder of Methodomics (2008) and the cofounder of Big Data Sant&#x00E9; (2018). He consults for major pharmaceutical companies and start-ups, all through academic pipelines (AstraZeneca, Biogen, Boston Scientific, Cook, Docaposte, Edimark, Ellipses, Elsevier, Janssen, IAGE, Lek, Methodomics, Merck, M&#x00E9;rieux, Octopize, Sanofi-Genzyme, Lifen, Aspire UAE). PAG is also a volunteer board member at AXA, a not-for-profit mutual insurance company (since 2021). He has no prescribing activity related to drugs or devices and receives no wages from these activities.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AE</term><def><p>adverse event</p></def></def-item><def-item><term id="abb2">ARR</term><def><p>annualized relapse rate</p></def></def-item><def-item><term id="abb3">CDISC</term><def><p>Clinical Data Interchange Standards Consortium</p></def></def-item><def-item><term id="abb4">CDW</term><def><p>confirmed disability worsening</p></def></def-item><def-item><term id="abb5">CT-GAN</term><def><p>Conditional Tabular Generative Adversarial Network</p></def></def-item><def-item><term id="abb6">FAMD</term><def><p>factor analysis of mixed data</p></def></def-item><def-item><term id="abb7">GAN</term><def><p>generative adversarial network</p></def></def-item><def-item><term id="abb8">GdE</term><def><p>gadolinium-enhancing</p></def></def-item><def-item><term id="abb9">HR</term><def><p>hidden rate</p></def></def-item><def-item><term id="abb10">IPD</term><def><p>individual patient 
data</p></def></def-item><def-item><term id="abb11">LC</term><def><p>local cloaking</p></def></def-item><def-item><term id="abb12">MRI</term><def><p>magnetic resonance imaging</p></def></def-item><def-item><term id="abb13">MS</term><def><p>multiple sclerosis</p></def></def-item><def-item><term id="abb14">Peg-IFN&#x03B2;</term><def><p>peginterferon beta</p></def></def-item><def-item><term id="abb15">RCT</term><def><p>randomized clinical trial</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Warnke</surname><given-names>C</given-names> </name><name name-style="western"><surname>Hartung</surname><given-names>HP</given-names> </name></person-group><article-title>Big data in MS&#x2014;What can we learn from large international observational studies such as MSBase?</article-title><source>Mult Scler</source><year>2020</year><month>01</month><volume>26</volume><issue>1</issue><fpage>4</fpage><lpage>5</lpage><pub-id pub-id-type="doi">10.1177/1352458519868982</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kappos</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wolinsky</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Giovannoni</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Contribution of relapse-independent progression vs relapse-associated worsening to overall confirmed disability accumulation in typical relapsing multiple sclerosis in a pooled analysis of 2 randomized clinical trials</article-title><source>JAMA Neurol</source><year>2020</year><month>09</month><day>1</day><volume>77</volume><issue>9</issue><fpage>1132</fpage><lpage>1140</lpage><pub-id 
pub-id-type="doi">10.1001/jamaneurol.2020.1568</pub-id><pub-id pub-id-type="medline">32511687</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="web"><article-title>Center for global clinical research data</article-title><source>Vivli</source><access-date>2023-11-04</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://vivli.org/">https://vivli.org/</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="web"><source>ClinicalStudyDataRequest.com</source><access-date>2023-11-04</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://clinicalstudydatarequest.com/">https://clinicalstudydatarequest.com/</ext-link></comment></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Caro</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Ishak</surname><given-names>KJ</given-names> </name></person-group><article-title>No head-to-head trial? 
Simulate the missing arms</article-title><source>Pharmacoeconomics</source><year>2010</year><volume>28</volume><issue>10</issue><fpage>957</fpage><lpage>967</lpage><pub-id pub-id-type="doi">10.2165/11537420-000000000-00000</pub-id><pub-id pub-id-type="medline">20831304</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Signorovitch</surname><given-names>JE</given-names> </name><name name-style="western"><surname>Sikirica</surname><given-names>V</given-names> </name><name name-style="western"><surname>Erder</surname><given-names>MH</given-names> </name><etal/></person-group><article-title>Matching-adjusted indirect comparisons: a new tool for timely comparative effectiveness research</article-title><source>Value Health</source><year>2012</year><volume>15</volume><issue>6</issue><fpage>940</fpage><lpage>947</lpage><pub-id pub-id-type="doi">10.1016/j.jval.2012.05.004</pub-id><pub-id pub-id-type="medline">22999145</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pittock</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Barnett</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bennett</surname><given-names>JL</given-names> </name><etal/></person-group><article-title>Ravulizumab in aquaporin-4-positive neuromyelitis optica spectrum disorder</article-title><source>Ann Neurol</source><year>2023</year><month>06</month><volume>93</volume><issue>6</issue><fpage>1053</fpage><lpage>1068</lpage><pub-id pub-id-type="doi">10.1002/ana.26626</pub-id><pub-id pub-id-type="medline">36866852</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Eichler</surname><given-names>HG</given-names> </name><name name-style="western"><surname>Abadie</surname><given-names>E</given-names> </name><name name-style="western"><surname>Breckenridge</surname><given-names>A</given-names> </name><name name-style="western"><surname>Leufkens</surname><given-names>H</given-names> </name><name name-style="western"><surname>Rasi</surname><given-names>G</given-names> </name></person-group><article-title>Open clinical trial data for all? A view from regulators</article-title><source>PLoS Med</source><year>2012</year><volume>9</volume><issue>4</issue><fpage>e1001202</fpage><pub-id pub-id-type="doi">10.1371/journal.pmed.1001202</pub-id><pub-id pub-id-type="medline">22505851</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="web"><article-title>Opinion 05/2014 on anonymisation techniques</article-title><source>Data Protection Working Party</source><year>2014</year><access-date>2023-11-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://ec.europa.eu/justice/article-29/documentation/opinion-recommendation/files/2014/wp216_en.pdf">https://ec.europa.eu/justice/article-29/documentation/opinion-recommendation/files/2014/wp216_en.pdf</ext-link></comment></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rocher</surname><given-names>L</given-names> </name><name name-style="western"><surname>Hendrickx</surname><given-names>JM</given-names> </name><name name-style="western"><surname>de Montjoye</surname><given-names>YA</given-names> </name></person-group><article-title>Estimating the success of re-identifications in incomplete datasets using generative models</article-title><source>Nat Commun</source><year>2019</year><month>07</month><day>23</day><volume>10</volume><issue>1</issue><fpage>3069</fpage><pub-id 
pub-id-type="doi">10.1038/s41467-019-10933-3</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="web"><article-title>L&#x2019;anonymisation de donn&#x00E9;es personnelles</article-title><source>Commission Nationale de l&#x2019;informatique et des libert&#x00E9;s</source><year>2020</year><access-date>2023-12-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cnil.fr/fr/lanonymisation-de-donnees-personnelles">https://www.cnil.fr/fr/lanonymisation-de-donnees-personnelles</ext-link></comment></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Giuffr&#x00E8;</surname><given-names>M</given-names> </name><name name-style="western"><surname>Shung</surname><given-names>DL</given-names> </name></person-group><article-title>Harnessing the power of synthetic data in healthcare: innovation, application, and privacy</article-title><source>npj Digit Med</source><year>2023</year><month>10</month><day>9</day><volume>6</volume><issue>1</issue><fpage>1</fpage><lpage>8</lpage><pub-id pub-id-type="doi">10.1038/s41746-023-00927-3</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>JY</given-names> </name><name name-style="western"><surname>Kr&#x00E4;henb&#x00FC;hl</surname><given-names>P</given-names> </name><name name-style="western"><surname>Shechtman</surname><given-names>E</given-names> </name><name name-style="western"><surname>Efros</surname><given-names>AA</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Leibe</surname><given-names>B</given-names> </name><name name-style="western"><surname>Matas</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Sebe</surname><given-names>N</given-names> </name><name name-style="western"><surname>Welling</surname><given-names>M</given-names> </name></person-group><article-title>Generative visual manipulation on the natural image manifold</article-title><source>Comput Vis &#x2013; ECCV 2016</source><year>2016</year><publisher-name>Springer International Publishing</publisher-name><fpage>597</fpage><lpage>613</lpage><pub-id pub-id-type="doi">10.1007/978-3-319-46454-1_36</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name></person-group><article-title>Large language models in medicine</article-title><source>Nat Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Demuth</surname><given-names>S</given-names> </name><name name-style="western"><surname>Paris</surname><given-names>J</given-names> </name><name name-style="western"><surname>Faddeenkov</surname><given-names>I</given-names> </name><name name-style="western"><surname>De S&#x00E8;ze</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Gourraud</surname><given-names>PA</given-names> </name></person-group><article-title>Clinical applications of deep learning in neuroinflammatory diseases: a scoping review</article-title><source>Rev Neurol (Paris)</source><year>2025</year><month>03</month><volume>181</volume><issue>3</issue><fpage>135</fpage><lpage>155</lpage><pub-id pub-id-type="doi">10.1016/j.neurol.2024.04.004</pub-id><pub-id pub-id-type="medline">38772806</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>El Emam</surname><given-names>K</given-names> </name></person-group><article-title>Seven ways to evaluate the utility of synthetic data</article-title><source>IEEE Secur Privacy</source><year>2020</year><month>07</month><volume>18</volume><issue>4</issue><fpage>56</fpage><lpage>59</lpage><pub-id pub-id-type="doi">10.1109/MSEC.2020.2992821</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yale</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dash</surname><given-names>S</given-names> </name><name name-style="western"><surname>Dutta</surname><given-names>R</given-names> </name><name name-style="western"><surname>Guyon</surname><given-names>I</given-names> </name><name name-style="western"><surname>Pavao</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bennett</surname><given-names>KP</given-names> </name></person-group><article-title>Generation and evaluation of privacy preserving synthetic health data</article-title><source>Neurocomputing</source><year>2020</year><month>11</month><volume>416</volume><fpage>244</fpage><lpage>255</lpage><pub-id pub-id-type="doi">10.1016/j.neucom.2019.12.136</pub-id></nlm-citation></ref><ref 
id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Esmaeilzadeh</surname><given-names>P</given-names> </name></person-group><article-title>Generative AI in medical practice: in-depth exploration of privacy and security challenges</article-title><source>J Med Internet Res</source><year>2024</year><month>03</month><day>8</day><volume>26</volume><fpage>e53008</fpage><pub-id pub-id-type="doi">10.2196/53008</pub-id><pub-id pub-id-type="medline">38457208</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="web"><article-title>Roundtable of G7 data protection and privacy authorities statement on generative AI</article-title><source>G7 Data Protection and Privacy Authorities</source><year>2023</year><access-date>2025-09-05</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cnil.fr/sites/cnil/files/2023-06/g7roundtable_202306_statement.pdf">https://www.cnil.fr/sites/cnil/files/2023-06/g7roundtable_202306_statement.pdf</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Xiong</surname><given-names>P</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>W</given-names> </name></person-group><article-title>Adversarial attacks against deep generative models on data: a survey</article-title><source>IEEE Trans Knowl Data 
Eng</source><year>2023</year><month>04</month><volume>35</volume><issue>4</issue><fpage>3367</fpage><lpage>3388</lpage><pub-id pub-id-type="doi">10.1109/TKDE.2021.3130903</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>El Emam</surname><given-names>K</given-names> </name><name name-style="western"><surname>Mosquera</surname><given-names>L</given-names> </name><name name-style="western"><surname>Fang</surname><given-names>X</given-names> </name></person-group><article-title>Validating a membership disclosure metric for synthetic health data</article-title><source>JAMIA Open</source><year>2022</year><month>12</month><volume>5</volume><issue>4</issue><fpage>ooac083</fpage><pub-id pub-id-type="doi">10.1093/jamiaopen/ooac083</pub-id><pub-id pub-id-type="medline">36238080</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pappalardo</surname><given-names>F</given-names> </name><name name-style="western"><surname>Russo</surname><given-names>G</given-names> </name><name name-style="western"><surname>Tshinanu</surname><given-names>FM</given-names> </name><name name-style="western"><surname>Viceconti</surname><given-names>M</given-names> </name></person-group><article-title>In silico clinical trials: concepts and early adoptions</article-title><source>Brief Bioinform</source><year>2019</year><month>09</month><day>27</day><volume>20</volume><issue>5</issue><fpage>1699</fpage><lpage>1708</lpage><pub-id pub-id-type="doi">10.1093/bib/bby043</pub-id><pub-id pub-id-type="medline">29868882</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Badano</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Graff</surname><given-names>CG</given-names> </name><name name-style="western"><surname>Badal</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Evaluation of digital breast tomosynthesis as replacement of full-field digital mammography using an in silico imaging trial</article-title><source>JAMA Netw Open</source><year>2018</year><month>11</month><day>2</day><volume>1</volume><issue>7</issue><fpage>e185474</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2018.5474</pub-id><pub-id pub-id-type="medline">30646401</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sips</surname><given-names>FLP</given-names> </name><name name-style="western"><surname>Pappalardo</surname><given-names>F</given-names> </name><name name-style="western"><surname>Russo</surname><given-names>G</given-names> </name><name name-style="western"><surname>Bursi</surname><given-names>R</given-names> </name></person-group><article-title>In silico clinical trials for relapsing-remitting multiple sclerosis with MS TreatSim</article-title><source>BMC Med Inform Decis Mak</source><year>2022</year><month>11</month><day>15</day><volume>22</volume><issue>Suppl 6</issue><fpage>294</fpage><pub-id pub-id-type="doi">10.1186/s12911-022-02034-x</pub-id><pub-id pub-id-type="medline">36380294</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="web"><article-title>Synthetic SDTM sample dataset</article-title><source>GitHub</source><access-date>2023-07-19</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/lhncbc/r-snippets-bmi/tree/master/cdisc/inst/extdata/cdisc01/csv">https://github.com/lhncbc/r-snippets-bmi/tree/master/cdisc/inst/extdata/cdisc01/csv</ext-link></comment></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Azizi</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>C</given-names> </name><name name-style="western"><surname>Mosquera</surname><given-names>L</given-names> </name><name name-style="western"><surname>Pilote</surname><given-names>L</given-names> </name><name name-style="western"><surname>El Emam</surname><given-names>K</given-names> </name><collab>GOING-FWD Collaborators</collab></person-group><article-title>Can synthetic data be a proxy for real clinical trial data? A validation study</article-title><source>BMJ Open</source><year>2021</year><month>04</month><day>16</day><volume>11</volume><issue>4</issue><fpage>e043497</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2020-043497</pub-id><pub-id pub-id-type="medline">33863713</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>El Kababji</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mitsakakis</surname><given-names>N</given-names> </name><name name-style="western"><surname>Jonker</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Augmenting insufficiently accruing oncology clinical trials using generative models: validation study</article-title><source>J Med Internet Res</source><year>2025</year><month>03</month><day>5</day><volume>27</volume><fpage>e66821</fpage><pub-id pub-id-type="doi">10.2196/66821</pub-id><pub-id pub-id-type="medline">40053790</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Akiya</surname><given-names>I</given-names> </name><name 
name-style="western"><surname>Ishihara</surname><given-names>T</given-names> </name><name name-style="western"><surname>Yamamoto</surname><given-names>K</given-names> </name></person-group><article-title>Comparison of synthetic data generation techniques for control group survival data in oncology clinical trials: simulation study</article-title><source>JMIR Med Inform</source><year>2024</year><month>06</month><day>18</day><volume>12</volume><fpage>e55118</fpage><pub-id pub-id-type="doi">10.2196/55118</pub-id><pub-id pub-id-type="medline">38889082</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hern&#x00E1;n</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Robins</surname><given-names>JM</given-names> </name></person-group><article-title>Using big data to emulate a target trial when a randomized trial is not available</article-title><source>Am J Epidemiol</source><year>2016</year><month>04</month><day>15</day><volume>183</volume><issue>8</issue><fpage>758</fpage><lpage>764</lpage><pub-id pub-id-type="doi">10.1093/aje/kwv254</pub-id><pub-id pub-id-type="medline">26994063</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Popat</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>SV</given-names> </name><name name-style="western"><surname>Scheuer</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Addressing challenges with real-world synthetic control arms to demonstrate the comparative effectiveness of Pralsetinib in non-small cell lung cancer</article-title><source>Nat Commun</source><year>2022</year><month>06</month><day>17</day><volume>13</volume><issue>1</issue><fpage>3500</fpage><pub-id 
pub-id-type="doi">10.1038/s41467-022-30908-1</pub-id><pub-id pub-id-type="medline">35715405</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hernadez</surname><given-names>M</given-names> </name><name name-style="western"><surname>Epelde</surname><given-names>G</given-names> </name><name name-style="western"><surname>Alberdi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Cilla</surname><given-names>R</given-names> </name><name name-style="western"><surname>Rankin</surname><given-names>D</given-names> </name></person-group><article-title>Synthetic tabular data evaluation in the health domain covering resemblance, utility, and privacy dimensions</article-title><source>Methods Inf Med</source><year>2023</year><month>06</month><volume>62</volume><issue>S 01</issue><fpage>e19</fpage><lpage>e38</lpage><pub-id pub-id-type="doi">10.1055/s-0042-1760247</pub-id><pub-id pub-id-type="medline">36623830</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Demuth</surname><given-names>S</given-names> </name><name name-style="western"><surname>De S&#x00E8;ze</surname><given-names>J</given-names> </name><name name-style="western"><surname>Edan</surname><given-names>G</given-names> </name><name name-style="western"><surname>Ziemssen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Simon</surname><given-names>F</given-names> </name><name name-style="western"><surname>Gourraud</surname><given-names>PA</given-names> </name></person-group><article-title>Digital representation of patients as medical digital twins: data-centric viewpoint</article-title><source>JMIR Med 
Inform</source><year>2025</year><month>01</month><day>28</day><volume>13</volume><issue>1</issue><fpage>e53542</fpage><pub-id pub-id-type="doi">10.2196/53542</pub-id><pub-id pub-id-type="medline">39881430</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mosquera</surname><given-names>L</given-names> </name><name name-style="western"><surname>El Emam</surname><given-names>K</given-names> </name><name name-style="western"><surname>Ding</surname><given-names>L</given-names> </name><etal/></person-group><article-title>A method for generating synthetic longitudinal health data</article-title><source>BMC Med Res Methodol</source><year>2023</year><month>03</month><day>23</day><volume>23</volume><issue>1</issue><fpage>67</fpage><pub-id pub-id-type="doi">10.1186/s12874-023-01869-w</pub-id><pub-id pub-id-type="medline">36959532</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nikolentzos</surname><given-names>G</given-names> </name><name name-style="western"><surname>Vazirgiannis</surname><given-names>M</given-names> </name><name name-style="western"><surname>Xypolopoulos</surname><given-names>C</given-names> </name><name name-style="western"><surname>Lingman</surname><given-names>M</given-names> </name><name name-style="western"><surname>Brandt</surname><given-names>EG</given-names> </name></person-group><article-title>Synthetic electronic health records generated with variational graph autoencoders</article-title><source>NPJ Digit Med</source><year>2023</year><month>04</month><day>29</day><volume>6</volume><issue>1</issue><fpage>83</fpage><pub-id pub-id-type="doi">10.1038/s41746-023-00822-x</pub-id><pub-id pub-id-type="medline">37120594</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>El Emam</surname><given-names>K</given-names> </name><name name-style="western"><surname>Mosquera</surname><given-names>L</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>C</given-names> </name></person-group><article-title>Optimizing the synthesis of clinical trial data using sequential trees</article-title><source>J Am Med Inform Assoc</source><year>2021</year><month>01</month><day>15</day><volume>28</volume><issue>1</issue><fpage>3</fpage><lpage>13</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaa249</pub-id><pub-id pub-id-type="medline">33186440</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bourou</surname><given-names>S</given-names> </name><name name-style="western"><surname>El Saer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Velivassaki</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Voulkidis</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zahariadis</surname><given-names>T</given-names> </name></person-group><article-title>A review of tabular data synthesis using GANs on an IDS dataset</article-title><source>Information</source><year>2021</year><month>09</month><volume>12</volume><issue>9</issue><fpage>375</fpage><pub-id pub-id-type="doi">10.3390/info12090375</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guillaudeux</surname><given-names>M</given-names> </name><name name-style="western"><surname>Rousseau</surname><given-names>O</given-names> </name><name name-style="western"><surname>Petot</surname><given-names>J</given-names> 
</name><etal/></person-group><article-title>Patient-centric synthetic data generation, no reason to risk re-identification in biomedical data analysis</article-title><source>NPJ Digit Med</source><year>2023</year><month>03</month><day>10</day><volume>6</volume><issue>1</issue><fpage>37</fpage><pub-id pub-id-type="doi">10.1038/s41746-023-00771-5</pub-id><pub-id pub-id-type="medline">36899082</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhao</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Kunar</surname><given-names>A</given-names> </name><name name-style="western"><surname>Scheer</surname><given-names>H</given-names> </name><name name-style="western"><surname>Birke</surname><given-names>R</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>LY</given-names> </name></person-group><article-title>CTAB-GAN: effective table data synthesizing</article-title><source>arXiv</source><comment>Preprint posted online on Feb 16, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2102.08369</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nowok</surname><given-names>B</given-names> </name><name name-style="western"><surname>Raab</surname><given-names>GM</given-names> </name><name name-style="western"><surname>Dibben</surname><given-names>C</given-names> </name></person-group><article-title>Synthpop: bespoke creation of synthetic data in R</article-title><source>J Stat Softw</source><year>2016</year><month>10</month><day>28</day><volume>74</volume><fpage>1</fpage><lpage>26</lpage><pub-id pub-id-type="doi">10.18637/jss.v074.i11</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Giovannoni</surname><given-names>G</given-names> </name><name name-style="western"><surname>Comi</surname><given-names>G</given-names> </name><name name-style="western"><surname>Cook</surname><given-names>S</given-names> </name><etal/></person-group><article-title>A placebo-controlled trial of oral cladribine for relapsing multiple sclerosis</article-title><source>N Engl J Med</source><year>2010</year><month>02</month><day>4</day><volume>362</volume><issue>5</issue><fpage>416</fpage><lpage>426</lpage><pub-id pub-id-type="doi">10.1056/NEJMoa0902533</pub-id><pub-id pub-id-type="medline">20089960</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Calabresi</surname><given-names>PA</given-names> </name><name name-style="western"><surname>Kieseier</surname><given-names>BC</given-names> </name><name name-style="western"><surname>Arnold</surname><given-names>DL</given-names> </name><etal/></person-group><article-title>Pegylated interferon &#x03B2;-1a for relapsing-remitting multiple sclerosis (ADVANCE): a randomised, phase 3, double-blind study</article-title><source>Lancet Neurol</source><year>2014</year><month>07</month><volume>13</volume><issue>7</issue><fpage>657</fpage><lpage>665</lpage><pub-id pub-id-type="doi">10.1016/S1474-4422(14)70068-7</pub-id><pub-id pub-id-type="medline">24794721</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>L&#x00EA;</surname><given-names>S</given-names> </name><name name-style="western"><surname>Josse</surname><given-names>J</given-names> </name><name name-style="western"><surname>Husson</surname><given-names>F</given-names> </name></person-group><article-title>FactoMineR: an R package for multivariate 
analysis</article-title><source>J Stat Softw</source><access-date>2025-09-05</access-date><volume>25</volume><issue>1</issue><comment><ext-link ext-link-type="uri" xlink:href="https://www.jstatsoft.org/article/view/v025i01">https://www.jstatsoft.org/article/view/v025i01</ext-link></comment><pub-id pub-id-type="doi">10.18637/jss.v025.i01</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van Buuren</surname><given-names>S</given-names> </name><name name-style="western"><surname>Groothuis-Oudshoorn</surname><given-names>K</given-names> </name></person-group><article-title>Mice: multivariate imputation by chained equations in R</article-title><source>J Stat Softw</source><volume>45</volume><issue>3</issue><pub-id pub-id-type="doi">10.18637/jss.v045.i03</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="web"><article-title>Octopize/saiph: a projection package</article-title><source>GitHub</source><access-date>2024-07-08</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/octopize/saiph">https://github.com/octopize/saiph</ext-link></comment></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Giovannoni</surname><given-names>G</given-names> </name><name name-style="western"><surname>Soelberg Sorensen</surname><given-names>P</given-names> </name><name name-style="western"><surname>Cook</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Efficacy of cladribine tablets in high disease activity subgroups of patients with relapsing multiple sclerosis: a post hoc analysis of the CLARITY study</article-title><source>Mult 
Scler</source><year>2019</year><month>05</month><volume>25</volume><issue>6</issue><fpage>819</fpage><lpage>827</lpage><pub-id pub-id-type="doi">10.1177/1352458518771875</pub-id><pub-id pub-id-type="medline">29716436</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Venables</surname><given-names>WN</given-names> </name><name name-style="western"><surname>Ripley</surname><given-names>BD</given-names> </name></person-group><article-title>Modern applied statistics with S</article-title><source>Stat Comput</source><year>2002</year><access-date>2025-09-05</access-date><publisher-name>Springer</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="http://link.springer.com/10.1007/978-0-387-21706-2">http://link.springer.com/10.1007/978-0-387-21706-2</ext-link></comment><pub-id pub-id-type="doi">10.1007/978-0-387-21706-2</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Davison</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Hinkley</surname><given-names>DV</given-names> </name></person-group><source>Bootstrap Methods and Their Application</source><year>1997</year><access-date>2025-09-05</access-date><publisher-name>Cambridge University Press</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.cambridge.org/core/product/identifier/9780511802843/type/book">https://www.cambridge.org/core/product/identifier/9780511802843/type/book</ext-link></comment><pub-id pub-id-type="doi">10.1017/CBO9780511802843</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pedregosa</surname><given-names>F</given-names> </name><name 
name-style="western"><surname>Varoquaux</surname><given-names>G</given-names> </name><name name-style="western"><surname>Gramfort</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Scikit-learn: machine learning in Python</article-title><source>J Mach Learn Res</source><year>2011</year><access-date>2025-09-05</access-date><volume>12</volume><issue>85</issue><fpage>2825</fpage><lpage>2830</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://jmlr.csail.mit.edu/papers/v12/pedregosa11a.html">https://jmlr.csail.mit.edu/papers/v12/pedregosa11a.html</ext-link></comment></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="web"><article-title>Hello from Octopize Docs</article-title><source>Octopize</source><access-date>2024-05-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://docs.octopize.io/">https://docs.octopize.io/</ext-link></comment></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="web"><article-title>Privacy-by-design generation of two virtual clinical trials in multiple sclerosis and their release as open datasets</article-title><source>Figshare</source><access-date>2024-08-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://figshare.com/s/ba49ed0550fd069567e6">https://figshare.com/s/ba49ed0550fd069567e6</ext-link></comment></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="web"><article-title>Privacy-by-design generation of two virtual clinical trials in multiple sclerosis</article-title><source>GitLab</source><access-date>2024-08-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://gitlab.com/stanislas.demuth/avatars-for-randomized-clinical-trials/">https://gitlab.com/stanislas.demuth/avatars-for-randomized-clinical-trials/</ext-link></comment></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>GH</given-names> </name><name name-style="western"><surname>Shin</surname><given-names>SY</given-names> </name></person-group><article-title>Federated learning on clinical benchmark data: performance assessment</article-title><source>J Med Internet Res</source><year>2020</year><month>10</month><day>26</day><volume>22</volume><issue>10</issue><fpage>e20891</fpage><pub-id pub-id-type="doi">10.2196/20891</pub-id><pub-id pub-id-type="medline">33104011</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Azizi</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Lindner</surname><given-names>S</given-names> </name><name name-style="western"><surname>Shiba</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>A comparison of synthetic data generation and federated analysis for enabling international evaluations of cardiovascular health</article-title><source>Sci Rep</source><year>2023</year><month>07</month><day>17</day><volume>13</volume><issue>1</issue><fpage>11540</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-38457-3</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Additional tables and figures.</p><media xlink:href="jmir_v27i1e71297_app1.docx" xlink:title="DOCX File, 701 KB"/></supplementary-material></app-group></back></article>