<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v23i5e24236</article-id>
      <article-id pub-id-type="pmid">33998998</article-id>
      <article-id pub-id-type="doi">10.2196/24236</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Understanding Barriers to Novel Data Linkages: Topic Modeling of the Results of the LifeInfo Survey</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Nevalainen</surname>
            <given-names>Jaakko</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Ren</surname>
            <given-names>Ziyou</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Pimentel</surname>
            <given-names>Maria da Graca</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Shubina</surname>
            <given-names>Ivanna</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Buechler</surname>
            <given-names>Nicole</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Clarke</surname>
            <given-names>Holly</given-names>
          </name>
          <degrees>MA</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1975-5679</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Clark</surname>
            <given-names>Stephen</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4090-6002</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Birkin</surname>
            <given-names>Mark</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5991-098X</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Iles-Smith</surname>
            <given-names>Heather</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-0520-2694</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Glaser</surname>
            <given-names>Adam</given-names>
          </name>
          <degrees>DM</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1814-5120</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Morris</surname>
            <given-names>Michelle A</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Leeds Institute for Data Analytics</institution>
            <institution>University of Leeds</institution>
            <addr-line>Level 11, Worsley Building</addr-line>
            <addr-line>Clarendon Way</addr-line>
            <addr-line>Leeds, LS2 9NL</addr-line>
            <country>United Kingdom</country>
            <phone>44 0113 343 9680</phone>
            <email>M.Morris@leeds.ac.uk</email>
          </address>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9325-619X</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Leeds Institute for Data Analytics</institution>
        <institution>University of Leeds</institution>
        <addr-line>Leeds</addr-line>
        <country>United Kingdom</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>School of Geography</institution>
        <institution>University of Leeds</institution>
        <addr-line>Leeds</addr-line>
        <country>United Kingdom</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Leeds Teaching Hospitals NHS Trust</institution>
        <addr-line>Leeds</addr-line>
        <country>United Kingdom</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>School of Health and Society</institution>
        <institution>University of Salford</institution>
        <addr-line>Salford</addr-line>
        <country>United Kingdom</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Leeds Institute of Medical Research</institution>
        <institution>School of Medicine</institution>
        <institution>University of Leeds</institution>
        <addr-line>Leeds</addr-line>
        <country>United Kingdom</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Michelle A Morris <email>M.Morris@leeds.ac.uk</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>5</month>
        <year>2021</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>17</day>
        <month>5</month>
        <year>2021</year>
      </pub-date>
      <volume>23</volume>
      <issue>5</issue>
      <elocation-id>e24236</elocation-id>
      <history>
        <date date-type="received">
          <day>18</day>
          <month>9</month>
          <year>2020</year>
        </date>
        <date date-type="rev-request">
          <day>6</day>
          <month>11</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>27</day>
          <month>1</month>
          <year>2021</year>
        </date>
        <date date-type="accepted">
          <day>12</day>
          <month>4</month>
          <year>2021</year>
        </date>
      </history>
      <copyright-statement>©Holly Clarke, Stephen Clark, Mark Birkin, Heather Iles-Smith, Adam Glaser, Michelle A Morris. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 17.05.2021.</copyright-statement>
      <copyright-year>2021</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2021/5/e24236" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Novel consumer and lifestyle data, such as those collected by supermarket loyalty cards or mobile phone exercise tracking apps, offer numerous benefits for researchers seeking to understand diet- and exercise-related risk factors for diseases. However, limited research has addressed public attitudes toward linking these data with individual health records for research purposes. Data linkage, combining data from multiple sources, provides the opportunity to enhance preexisting data sets to gain new insights.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>The aim of this study is to identify key barriers to data linkage and recommend safeguards and procedures that would encourage individuals to share such data for potential future research.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>The <italic>LifeInfo Survey</italic> consulted the public on their attitudes toward sharing consumer and lifestyle data for research purposes. Where barriers to data sharing existed, participants provided unstructured survey responses detailing what would make them more likely to share data for linkage with their health records in the future. The topic modeling technique latent Dirichlet allocation was used to analyze these textual responses to uncover common thematic topics within the texts.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Participants provided responses related to sharing their store loyalty card data (n=2338) and health and fitness app data (n=1531). Key barriers to data sharing identified through topic modeling included data safety and security, personal privacy, requirements of further information, fear of data being accessed by others, problems with data accuracy, not understanding the reason for data linkage, and not using services that produce these data. We provide recommendations for addressing these issues to establish the best practice for future researchers interested in using these data.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>This study formulates a large-scale consultation of public attitudes toward this kind of data linkage, which is an important first step in understanding and addressing barriers to participation in research using novel consumer and lifestyle data.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>topic modeling</kwd>
        <kwd>text analysis</kwd>
        <kwd>lifestyle data</kwd>
        <kwd>consumer data</kwd>
        <kwd>mHealth</kwd>
        <kwd>loyalty card</kwd>
        <kwd>fitness tracker</kwd>
        <kwd>data linkage</kwd>
        <kwd>data sharing</kwd>
        <kwd>public attitudes</kwd>
        <kwd>public opinion</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Poor diet and physical inactivity are known to contribute to millions of early deaths worldwide [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. In the United Kingdom, 1 in 7 deaths are attributed to poor diet, whereas 1 in 6 deaths are attributed to physical inactivity [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. A greater understanding of these risk factors for lifestyle-influenced diseases such as type 2 diabetes, certain cancers, and cardiovascular diseases is needed to improve global health. At the same time, technological advancements have led to increasingly large volumes of <italic>big data</italic> being produced about individual food consumption and exercise habits [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>].</p>
        <p>Historically, a major barrier to research on lifestyle risk factors for noncommunicable diseases has been the availability of accurate, robust, and reproducible data on diet and exercise [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. Big and novel lifestyle data, produced when using services such as supermarket loyalty cards or health and fitness monitoring apps, have many benefits compared with more traditional forms of data collected through surveys, interviews, and food or exercise logs; as these data are collected during everyday activities, they are naturalistic and nonintrusive [<xref ref-type="bibr" rid="ref9">9</xref>], meaning they do not encounter the selective reporting bias entailed with traditional methods [<xref ref-type="bibr" rid="ref7">7</xref>]. Furthermore, large volumes of data can potentially be shared with researchers almost in real time, surpassing the scale of traditional methods at a very low cost and requiring little or no effort on the part of the participant [<xref ref-type="bibr" rid="ref10">10</xref>]. Consequently, these data are uniquely set up for at-scale longitudinal studies with the additional benefit of extending research into traditionally hard-to-reach populations [<xref ref-type="bibr" rid="ref10">10</xref>].</p>
        <p>These data include mobile phone step counts, GPS-tracked exercise, wearable device heart rate monitoring, and store loyalty card records. In health research, few studies have demonstrated the full utility of consumer and personal data of this sort, as they have not typically been available to researchers [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. Nonetheless, initiatives such as the Consumer Data Research Center [<xref ref-type="bibr" rid="ref12">12</xref>] have begun to facilitate access to novel data sources.</p>
        <p>In the context of diet and health research, attention has particularly been drawn to the potential of using supermarket loyalty card data (eg, Tesco Club Card) to understand food and drink purchase behavior [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref16">16</xref>] and data from wearable devices (eg, Fitbit and Garmin) or mobile phone fitness apps (eg, MyFitnessPal and Strava) to understand exercise behavior [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref19">19</xref>]. However, studies that have used commercial lifestyle data for research purposes have reported low uptake [<xref ref-type="bibr" rid="ref10">10</xref>]. Although this may have been influenced by factors such as the methods used to contact participants, there is a clear research need to understand participants’ reluctance to share their data.</p>
        <p>The combination of data from multiple sources to create enhanced data sets, known as <italic>data linkage</italic>, provides new insights for health research that surpass those provided by the data sets individually. The value of consumer and lifestyle data is further amplified when combined with health outcomes data [<xref ref-type="bibr" rid="ref10">10</xref>]; for example, Aiello et al [<xref ref-type="bibr" rid="ref11">11</xref>] used supermarket loyalty card data for small geographic areas to study the association between food purchasing and health outcomes. We believe that similar work linking individual health outcomes and lifestyle data, rather than at the ecological study level, would provide added benefits through greater specificity and personalization. However, an individualized approach may highlight data privacy and ethics barriers, especially in light of understandable historical concerns regarding data linkages, such as those proposed by care.data in the United Kingdom in 2013 and 2016 [<xref ref-type="bibr" rid="ref20">20</xref>]. In addition, data linkage can create disclosure concerns or can increase sensitivity, which must be addressed by researchers seeking to use such methods [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>].</p>
        <p>Public attitudes toward health data sharing for research appear to be dependent on many factors, with the most prominent being the actual data sharing process, including data security considerations [<xref ref-type="bibr" rid="ref23">23</xref>], the purpose and social license for the research [<xref ref-type="bibr" rid="ref23">23</xref>-<xref ref-type="bibr" rid="ref25">25</xref>], and the level of sensitivity attributed to data [<xref ref-type="bibr" rid="ref26">26</xref>]. Others have reported high levels of trust in health institutions, lower levels of trust in academics, and the lowest levels of trust in private companies for data sharing initiatives [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. As research using big lifestyle data often involves all 3 of these factors, it is important to address how trust might influence people’s willingness to participate in research under these circumstances.</p>
      </sec>
      <sec>
        <title>Objectives</title>
        <p>Despite the obvious opportunities provided by the proliferation of big data for health research, little is known about public attitudes toward the linkage of lifestyle data with individual health records for research. The <italic>LifeInfo Survey</italic> is the first of its kind at this scale (n=7101 participants) to consult the public on their attitudes toward sharing novel forms of consumer and lifestyle data for linkage with their health records for health research [<xref ref-type="bibr" rid="ref28">28</xref>]. Moreover, this survey included free-text response questions that allowed individuals, in their own words, to state what actions would alleviate their concerns about data sharing in this setting. Surveys do not frequently allow for unstructured answers of this kind because of the subjectivity and time commitment imposed with the qualitative coding of texts [<xref ref-type="bibr" rid="ref29">29</xref>]. The use of novel data science methods, including topic modeling, can facilitate the semiautomated analysis of large amounts of textual data to identify latent themes [<xref ref-type="bibr" rid="ref30">30</xref>].</p>
        <p>This study aims to advance the understanding of attitudes toward data sharing by identifying specific barriers that present themselves when linking store loyalty cards or health and fitness app data with individual health records for research purposes. We recommend procedures and safeguards that can be applied to future research linking lifestyle and health data to increase participant support.</p>
        <p>We hypothesize that common issues identified in the literature on public attitudes toward data sharing for health research, such as data security and trust, will be evident within survey responses in addition to concerns specific to the type of data to be linked, in this case, store loyalty card and health and fitness app data.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Collection</title>
        <p>To gather public opinion on data sharing and linkage, the <italic>LifeInfo Survey</italic> recruited participants between September 2017 and October 2019 across 2 health settings, the Leeds Teaching Hospitals National Health Service Trust and Low Moor Medical General Practice Surgery, and 2 nonhealth settings, the Leeds Institute for Data Analytics and the Leeds City Council Health Communities survey [<xref ref-type="bibr" rid="ref31">31</xref>]. The <italic>LifeInfo Survey</italic> (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) addressed a hypothetical scenario—whether, if asked for a future study, respondents would give permission for their consumer and lifestyle data to be linked with their health records for health research. This was conditional on their data being stored safely and not shared with anyone outside the research team. The survey consulted participants specifically about 2 types of data: (1) consumer data from store loyalty cards detailing food and drink purchases and (2) lifestyle data from health and fitness apps, websites, and wearable devices. Basic demographic data and which specific loyalty cards and health and fitness apps respondents used were additionally captured. Additional information about the <italic>LifeInfo</italic> project can be found in the study protocol [<xref ref-type="bibr" rid="ref32">32</xref>]. Detailed information on the participants and the main survey results are reported elsewhere [<xref ref-type="bibr" rid="ref28">28</xref>].</p>
        <p>Those who responded <italic>no</italic> or <italic>not sure</italic> to whether they would share their data were asked, “what (if anything) might make you change your mind in the future?” concerning their (1) store loyalty card data and (2) health and fitness app data. This study primarily analyzes the qualitative responses to these 2 questions (questions 4 and 9 in the original questionnaire included in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), henceforth referred to as (1) <italic>the store loyalty card question</italic> and (2) <italic>the health and fitness app question</italic>, to answer the following research question: “What are the reported barriers to linkage of lifestyle data with health records for research?” Responses were given in free-text format, allowing individuals to state, in their own words, potential desired changes that would make them more willing to share such data, although many also used this space to explain their reasons behind more negative responses. Primary analysis regarding overall willingness to share lifestyle data and demographic trends is reported elsewhere [<xref ref-type="bibr" rid="ref28">28</xref>] and summarized below. The survey questionnaire and all responses were provided in English.</p>
      </sec>
      <sec>
        <title>Ethics</title>
        <p>This study was granted ethical approval by the London-Brent Research Ethics Committee (reference 17/LO/0622).</p>
      </sec>
      <sec>
        <title>Modeling</title>
        <p>Latent Dirichlet allocation (LDA) was applied as a method of automated content analysis on unstructured survey responses. This technique was used to identify the underlying factors that contribute to respondents’ unwillingness or uncertainty about having their consumer or lifestyle data and health records linked for research purposes and potential changes that could influence them to do so. LDA is a generative probabilistic model that is frequently applied to textual data. The model has a 3-level hierarchical Bayesian structure under which each <italic>document</italic> is modeled as several topics, and each topic is modeled as a set of terms [<xref ref-type="bibr" rid="ref30">30</xref>]. The model uses the Gibbs sampling technique to estimate model parameters. The LDA modeling procedure was applied to free-text responses separately for the store loyalty card question and the health and fitness app question to create a model for each.</p>
        <sec>
          <title>Processing</title>
          <p>Data cleaning and processing were performed using the R software [<xref ref-type="bibr" rid="ref33">33</xref>]. Noninformative responses (eg, <italic>N/A</italic> and <italic>no comment</italic>) were removed from the data set. Survey responses that consisted of only a single word were removed from the analysis data set, as the underlying mechanisms of LDA are based on the co-occurrence of terms. These are categorized separately, as attitudes are easily ascertained from single-word responses (shown in <xref ref-type="table" rid="table1">Tables 1</xref> and <xref ref-type="table" rid="table2">2</xref>).</p>
          <table-wrap position="float" id="table1">
            <label>Table 1</label>
            <caption>
              <p>Counts of single-word responses to the question, “What (if anything) might make you change your mind in the future?” about sharing store loyalty card data for linkage with health records (n=396).</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="750"/>
              <col width="250"/>
              <thead>
                <tr valign="top">
                  <td>Single word</td>
                  <td>Counts per word, n (%)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Nothing</td>
                  <td>256 (64.6)</td>
                </tr>
                <tr valign="top">
                  <td>No</td>
                  <td>98 (24.8)</td>
                </tr>
                <tr valign="top">
                  <td>Privacy</td>
                  <td>10 (2.5)</td>
                </tr>
                <tr valign="top">
                  <td>None</td>
                  <td>8 (2.0)</td>
                </tr>
                <tr valign="top">
                  <td>Maybe</td>
                  <td>5 (1.2)</td>
                </tr>
                <tr valign="top">
                  <td>Private</td>
                  <td>3 (0.8)</td>
                </tr>
                <tr valign="top">
                  <td>Confidentiality; personal</td>
                  <td>2 (0.5)</td>
                </tr>
                <tr valign="top">
                  <td>Confidential; discounts; dk<sup>a</sup>; hackers; illegible; incentives; money; unneeded; unlikely; unsure; why; yes</td>
                  <td>1 (0.3)</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table1fn1">
                <p><sup>a</sup>dk: don’t know.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <table-wrap position="float" id="table2">
            <label>Table 2</label>
            <caption>
              <p>Counts of single-word responses to the question, “What (if anything) might make you change your mind in the future?” about sharing fitness app data for linkage with health records (n=309).</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="750"/>
              <col width="250"/>
              <thead>
                <tr valign="top">
                  <td>Single word</td>
                  <td>Counts per word, n (%)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Nothing</td>
                  <td>189 (61.2)</td>
                </tr>
                <tr valign="top">
                  <td>No</td>
                  <td>71 (23.0)</td>
                </tr>
                <tr valign="top">
                  <td>Privacy; security</td>
                  <td>7 (2.3)</td>
                </tr>
                <tr valign="top">
                  <td>Maybe; none</td>
                  <td>5 (1.6)</td>
                </tr>
                <tr valign="top">
                  <td>q4<sup>a</sup></td>
                  <td>4 (1.3)</td>
                </tr>
                <tr valign="top">
                  <td>Anonymity; confidential; private; same; yes</td>
                  <td>2 (0.6)</td>
                </tr>
                <tr valign="top">
                  <td>Benefit; confidentiality; intrusive; might; nil; personal; possibly; relevance; uncertain; unlikely; why</td>
                  <td>1 (0.3)</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table2fn1">
                <p><sup>a</sup>q4: question 4.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <p>Preprocessing procedures, standard in natural language processing, were undertaken to create a <italic>document term matrix</italic> (DTM) on which to perform LDA. This included converting all words to lower case and removing white space, punctuation, and common <italic>stop-words</italic> from texts to leave only meaningful words. Frequent misspellings were replaced with their correct form, and common equivalent words were standardized (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>).</p>
          <p>The <italic>SMART</italic> (System for the Mechanical Analysis and Retrieval of Text) stop-word data set was used to identify uninformative words. However, bigrams (two-word terms) that contain many common stop-words were retained rather than removed, so that attitude positions within responses could be uncovered (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). Numbers were not removed as numeric bigrams are potentially meaningful (eg, <italic>100 percent</italic> and <italic>3rd party</italic>). Finally, lemmatization of words was undertaken to convert words into their root form (<italic>lemma</italic>) and reduce sparsity within the DTM (eg, <italic>cards</italic> was replaced with <italic>card</italic>; <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). Lemmatization was preferred over stemming, as lemmas are more human-readable than stems, which are not always complete words (eg, <italic>storing</italic> would be replaced with the stem <italic>stor</italic>).</p>
          <p>LDA was performed on a DTM of unigrams (one-word terms) and bigrams (two-word terms) within the texts. Bigrams are included in the DTM as this creates more human-interpretable topics, and many words are context-specific, for example, <italic>big brother</italic>/<italic>change mind</italic>, or formulate attitude positions in combination, for example, <italic>would change</italic>/<italic>wouldn’t change</italic>. In the example given, the term <italic>brother</italic> alone would provide little insight into data sharing attitudes, yet <italic>big brother</italic> signifies a potential invasion of privacy and distrust. Several papers have reported improved results for including bigrams and higher n-grams in different topic models [<xref ref-type="bibr" rid="ref34">34</xref>-<xref ref-type="bibr" rid="ref36">36</xref>]. Only terms that occurred in more than 2 documents were retained, leaving 993 unique terms for the store loyalty card question and 554 unique terms for the health and fitness app question.</p>
        </sec>
        <sec>
          <title>Topic Number Selection</title>
          <p>Selecting an appropriate number of topics is a key challenge for LDA. According to Green et al [<xref ref-type="bibr" rid="ref37">37</xref>], “too few topics will produce results that are overly broad, whereas choosing too many will result in the ‘over-clustering’ of a corpus into many small, highly-similar topics” that are difficult to interpret in a meaningful way. The number of topics (k) is conventionally chosen as the model with the lowest value of <italic>perplexity</italic> when applying different models of candidate k to held-out data [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]. This perplexity measure captures how well a probability distribution or probability model predicts a sample, indicating how <italic>surprised</italic> the model is by new data. For LDA, it is equivalent to the inverse of the geometric mean per-word likelihood calculated on the held-out data [<xref ref-type="bibr" rid="ref30">30</xref>]. For this analysis, 10-fold cross-validation was used to select an appropriate number of topics (k) from a list of 15 candidate values of k ranging from 2 to 100, optimizing for perplexity. The candidate values of k are spaced at smaller intervals at the lower end of this range, as greater change is expected between these values. The cross-validation process randomly divides the data set into 10 approximately equally sized folds and uses 9 of these to train the model, using the held-out fold to test the model. This process was repeated 10 times such that each fold was used as the testing set once.</p>
          <p>Some research has found that the models that produce the most semantically meaningful topics—in that topics are easily interpreted by humans and terms representing concepts are given high probabilities within the model—are not necessarily the models with the best perplexity scores [<xref ref-type="bibr" rid="ref39">39</xref>]. Hence, a measurement of <italic>coherence</italic>, which research finds corresponds well with human-interpretable topics [<xref ref-type="bibr" rid="ref40">40</xref>], was also considered. Average topic <italic>probabilistic coherence</italic> measures topic quality based on how commonly topic terms co-occur, controlling for statistical independence [<xref ref-type="bibr" rid="ref41">41</xref>-<xref ref-type="bibr" rid="ref43">43</xref>]. This was compared for models of candidate k topics and balanced with perplexity scores.</p>
          <p>Once an appropriate number of topics was selected for each LDA model, the final models were created using responses from all individuals for the given question. This method was chosen, rather than using commonly used training and testing approaches, to support the study aim of summarizing survey responses rather than creating a predictive model to categorize new data, as no new <italic>LifeInfo Survey</italic> data will be collected in the future. Moreover, the size of the data set is small compared with many others that use LDA, and splitting the data set into fewer responses could reduce the model quality.</p>
          <p>For both topic number selection and the final models, the LDA hyperparameters were set at α=.1, influencing document-topic density, and β=.05, influencing word-topic density. The α prior was set at this relatively low value because <italic>LifeInfo Survey</italic> responses were short (refer to <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref> for plots showing the distribution of response lengths in words, averaging 12 words per document for the store loyalty card question and 9 words for the health and fitness app question), so we would expect there to be only a few topics formulating each document. β was also set to a relatively low value, as we expected a small number of words to be highly influential per topic given the short responses. α is modeled as asymmetrical, as we expected some topics to be more common than others within the survey responses. Previous work has found that asymmetrical α values provide substantial advantages to LDA results, whereas asymmetrical β priors provide no benefit [<xref ref-type="bibr" rid="ref44">44</xref>].</p>
        </sec>
        <sec>
          <title>Hierarchical Clustering</title>
          <p>To make the topics more easily interpretable, those created through LDA modeling were further categorized into thematic groups with the aid of hierarchical clustering of topics (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>). These topics were given summarizing names, shown in <xref ref-type="table" rid="table3">Tables 3</xref> and <xref ref-type="table" rid="table4">4</xref> by subheadings in bold, based on their content, considering both topic <italic>top terms</italic> and the contextual use of these terms in texts (<xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>). The Hellinger distance [<xref ref-type="bibr" rid="ref45">45</xref>] between topics was calculated based on their term ϕ values, and the 2 closest topics were clustered together. ϕ values are the probability of a term being used within a text given the topic; therefore, topics that more frequently use the same terms are considered closer. This was done iteratively until hierarchical clusters split topics into meaningful thematic categories. In some cases, topics thematically aligned with a category that was found to be semantically different to the topic according to hierarchical clustering; these are marked with a superscript in the tables and dendrogram and were reassigned to their appropriate thematic category.</p>
          <table-wrap position="float" id="table3">
            <label>Table 3</label>
            <caption>
              <p>Topics created by latent Dirichlet allocation modeling of LifeInfo store loyalty card question, showing topic names, top 15 terms per topic according to term ϕ values, topic prevalence, and topic probabilistic coherence. In total, 9 thematic categories are shown.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="30"/>
              <col width="130"/>
              <col width="550"/>
              <col width="190"/>
              <col width="100"/>
              <thead>
                <tr valign="top">
                  <td colspan="2">Themes and SC<sup>a</sup> topic number</td>
                  <td>Topic name: top 15 terms selected by highest probability of the term given the topic</td>
                  <td>Prevalence (estimated survey responses; n=1930)<sup>b</sup>, n (%)</td>
                  <td>Coherence</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td colspan="5">
                    <bold>Nothing would change mind</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SC14</td>
                  <td>Wouldn’t change mind: change, mind, change mind, nothing, would change, nothing would, don’t think, make, wouldn’t change, nothing change, think would, anything would, make change, think anything, and would make</td>
                  <td>134.52 (6.97)</td>
                  <td>0.47</td>
                </tr>
                <tr valign="top">
                  <td colspan="5">
                    <bold>Store loyalty card/don’t use store card</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SC16</td>
                  <td>Store loyalty card/don’t use: card, store, loyalty, loyalty card, store card, health, store loyalty, link, card health, don’t use, use loyalty, why store, no need, card would, and use store</td>
                  <td>147.65 (7.65)</td>
                  <td>0.34</td>
                </tr>
                <tr valign="top">
                  <td colspan="5"><bold>“Big Brother” and privacy invasion</bold></td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SC8</td>
                  <td>Big brother/nanny state: feel, privacy, big brother, brother, nothing, invasion, invasion privacy, regard, don’t feel, thing, feel like, state, choice, watch, and nanny</td>
                  <td>80.10 (4.15)</td>
                  <td>0.16</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SC1</td>
                  <td>Privacy and cold calling: concern, privacy, concern about, confidentiality, would concern, email, require, call, future, bombard, advertise, market, guarantee, about privacy, and would require</td>
                  <td>73.15 (3.79)</td>
                  <td>0.13</td>
                </tr>
                <tr valign="top">
                  <td colspan="5">
                    <bold>Personal information sharing and access by others</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SC2</td>
                  <td>Concerns about linkage and insurance: information, give, idea, insurance, don’t like, health, like idea, good, information could, wrong, reassurance, health insurance, hand, affect, and know about</td>
                  <td>80.87 (4.19)</td>
                  <td>0.05</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SC19</td>
                  <td>Data access and others: information, private, personal, access, company, stored, detail, hold, information stored, sell, people, safe, personal information, personal health, and party</td>
                  <td>103.45 (5.36)</td>
                  <td>0.04</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SC3</td>
                  <td>Don’t want to share personal information: share, information, information share, personal, share information, personal information, don’t want, nothing, detail, want information, nothing don’t, want share, wouldn’t want, don’t like, and not share</td>
                  <td>100.36 (5.20)</td>
                  <td>0.12</td>
                </tr>
                <tr valign="top">
                  <td colspan="5">
                    <bold>Data inaccuracy</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SC7</td>
                  <td>Data inaccuracy and bias: buy, shop, make, purchase, family, people, food, eat, lifestyle, relate, supermarket, diet, healthy, product, and good</td>
                  <td>117.92 (6.11)</td>
                  <td>0.09</td>
                </tr>
                <tr valign="top">
                  <td colspan="5">
                    <bold>Data security and protection</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SC15</td>
                  <td>Don’t trust organizations with data: data, trust, don’t trust, NHS<sup>c</sup>, share, organization, system, personal data, data share, hack, personal, guarantee, secure, not trust, and safety</td>
                  <td>95.34 (4.94)</td>
                  <td>0.09</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SC12</td>
                  <td>Data protection: data, data protection, protection, data would, data use, access, issue, how data, health data, would use, health, wouldn’t want, data link, secure, and link</td>
                  <td>81.06 (4.20)</td>
                  <td>0.14</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SC9</td>
                  <td>Data security: security, data, data security, breach, worry, assurance, worry about, security data, data breach, about security, information security, increase, improve, risk, and security information</td>
                  <td>82.41 (4.27)</td>
                  <td>0.06</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SC5</td>
                  <td>Guaranteed data safety/security: secure, 100, 100 percent, percent, convince, stored, safely, safe, stored safely, prefer, separate, if could, NHS, control, and how secure</td>
                  <td>74.88 (3.88)</td>
                  <td>0.37</td>
                </tr>
                <tr valign="top">
                  <td colspan="5">
                    <bold>Understanding research purpose and process</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SC20</td>
                  <td>Don’t understand benefit: understand, benefit, don’t understand, why would, would need, understand why, necessary, link, purpose, need understand, understand benefit, understand purpose, why need, would necessary, and need link</td>
                  <td>82.41 (4.27)</td>
                  <td>0.17</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SC6</td>
                  <td>Require demonstrable benefits: benefit, not sure, benefit would, link, explanation, see benefit, explain, appropriate, explanation why, would benefit, explain benefit, if could, sure would, care, and sure why</td>
                  <td>69.48 (3.60)</td>
                  <td>0.03</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SC4<sup>d</sup></td>
                  <td>Require reassurance: research, depend, data, purpose, specific, would depend, study, team, anonymize, happy, access, research team, if data, contact, and not use</td>
                  <td>92.06 (4.77)</td>
                  <td>0.07</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SC18<sup>d</sup></td>
                  <td>Require more information: information, more information, would use, information would, would need, would want, need know, information use, know why, want know, need more, would like, why would, information about, and how would</td>
                  <td>146.68 (7.60)</td>
                  <td>0.10</td>
                </tr>
                <tr valign="top">
                  <td colspan="5">
                    <bold>Health records shouldn’t be linked</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SC17</td>
                  <td>Health record should not be linked: health, record, health record, link, link health, nothing, private, confidential, don’t know, would link, why would, health care, nothing health, record would, and care</td>
                  <td>110.20 (5.71)</td>
                  <td>0.26</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SC11<sup>d</sup></td>
                  <td>Shopping habits and health shouldn’t be linked: shop, health, habit, shop habit, interest, health care, link, wouldn’t want, professional, shop health, condition, business, supermarket, health professional, and commercial</td>
                  <td>73.15 (3.79)</td>
                  <td>0.10</td>
                </tr>
                <tr valign="top">
                  <td colspan="5">
                    <bold>Don’t understand reason/relevance of data linkage</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SC13</td>
                  <td>Unsure of reason for linkage: not sure, relevant, medical, would need, unsure, sure why, record, medical record, don’t see, why would, relevant health, not relevant, sure would, medical information, and sure how</td>
                  <td>92.45 (4.79)</td>
                  <td>0.04</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SC10</td>
                  <td>Don’t see reason for linkage: reason, don’t see, link, relevance, can’t see, reason why, nothing, see relevance, point, see why, 2, see reason, should link, connection, and good</td>
                  <td>91.87 (4.76)</td>
                  <td>0.08</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table3fn1">
                <p><sup>a</sup>SC: store card.</p>
              </fn>
              <fn id="table3fn2">
                <p><sup>b</sup>Total prevalence does not sum exactly to 100%, and the total survey response counts do not sum exactly to N because of rounding.</p>
              </fn>
              <fn id="table3fn3">
                <p><sup>c</sup>NHS: National Health Service.</p>
              </fn>
              <fn id="table3fn4">
                <p><sup>d</sup>Topic has been regrouped from the category that hierarchical clustering indicated as semantically similar to the category most thematically aligned with its contents.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <table-wrap position="float" id="table4">
            <label>Table 4</label>
            <caption>
              <p>Topics created by latent Dirichlet allocation modeling of the LifeInfo health and fitness app question.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="30"/>
              <col width="110"/>
              <col width="570"/>
              <col width="190"/>
              <col width="0"/>
              <col width="100"/>
              <thead>
                <tr valign="top">
                  <td colspan="2">Themes and HA<sup>a</sup> topic number</td>
                  <td>Topic name: top 15 terms selected by highest probability of the term given the topic</td>
                  <td>Prevalence (estimated survey responses; n=1206)<sup>b</sup>, n (%)</td>
                  <td colspan="2">Coherence</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td colspan="6">
                    <bold>Nothing would change mind</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>HA17</td>
                  <td>Nothing would change mind: change, mind, change mind, nothing, would change, don’t think, nothing would, think anything, make, wouldn’t change, anything would, make change, would make, future, and think would</td>
                  <td>75.86 (6.29)</td>
                  <td colspan="2">0.46</td>
                </tr>
                <tr valign="top">
                  <td colspan="6">
                    <bold>Apps/websites/wearable devices and don’t use</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>HA16</td>
                  <td>Device and don’t use: device, use device, not use, wear, don’t use, device not, wearable, collect, future, wearable device, data collect, wear device, app, data, and device future</td>
                  <td>59.09 (4.90)</td>
                  <td colspan="2">0.10</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>HA11</td>
                  <td>App/website and don’t use: app, fitness, use app, don’t use, device, fitness app, lifestyle, app not, user, website, applicable, health app, not applicable, use fitness, and device app</td>
                  <td>53.67 (4.45)</td>
                  <td colspan="2">0.13</td>
                </tr>
                <tr valign="top">
                  <td colspan="6"><bold>“Big Brother” and privacy invasion</bold></td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>HA3</td>
                  <td>“Big Brother”: feel, good, idea, life, big brother, brother, feel like, make, don’t like, watch, like idea, bit, exercise, control, and NHS<sup>c</sup></td>
                  <td>55.8 (4.62)</td>
                  <td colspan="2">0.06</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>HA2</td>
                  <td>Privacy invasion and safety/security of data: privacy, safe, store, securely, invasion, not safe, store securely, invasion privacy, feel, hacker, issue, nothing, code, data store, and partly</td>
                  <td>53.43 (4.43)</td>
                  <td colspan="2">0.05</td>
                </tr>
                <tr valign="top">
                  <td colspan="6">
                    <bold>Personal information sharing</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>HA20</td>
                  <td>Information would be shared: information, share, information share, information wouldn’t, store, know information, share information, wouldn’t share, information store, sure information, worry, will share, health information, would share, and information will</td>
                  <td>65.12 (5.40)</td>
                  <td colspan="2">0.04</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>HA14</td>
                  <td>Information is personal: personal, information, private, access, personal information, don’t want, personal use, private information, information personal, reason, people, access information, point, long, and personal detail</td>
                  <td>61.75 (5.12)</td>
                  <td colspan="2">0.08</td>
                </tr>
                <tr valign="top">
                  <td colspan="6">
                    <bold>Who has access to these data?</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>HA15</td>
                  <td>Data access by insurance/private companies: company, insurance, lifestyle, relevant, make, insurance company, health, monitor, fitbit, will not, not relevant, point, interest, unsure, and wear</td>
                  <td>53.79 (4.46)</td>
                  <td colspan="2">0.12</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>HA7</td>
                  <td>Health records and linkage: record, health, health record, access, link, doctor, don’t want, information, access health, food, buy, not want, put, people, and link health</td>
                  <td>63.44 (5.26)</td>
                  <td colspan="2">0.20</td>
                </tr>
                <tr valign="top">
                  <td colspan="6">
                    <bold>Data inaccuracy</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>HA5</td>
                  <td>Inaccurate data and partial use: accurate, phone, not accurate, step, activity, app, hold, give, run, exercise, don’t think, record, count, picture, and walk</td>
                  <td>52.58 (4.36)</td>
                  <td colspan="2">0.11</td>
                </tr>
                <tr valign="top">
                  <td colspan="6">
                    <bold>Data security and protection</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>HA19</td>
                  <td>Not sure and security: not sure, secure, sure how, sure would, would secure, sure about, secure would, convince, illegible, situation, how secure, sure anything, sure if, would convince, and if would</td>
                  <td>53.79 (4.46)</td>
                  <td colspan="2">0.07</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>HA18</td>
                  <td>Don’t trust data security: trust, secure, nothing, don’t trust, computer, hack, fully, website, internet, not trust, nothing don’t, nothing secure, wouldn’t trust, information, and trust information</td>
                  <td>60.66 (5.03)</td>
                  <td colspan="2">0.08</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>HA10<sup>d</sup></td>
                  <td>Data protection against sharing: data, share, protection, data could, data protection, not share, data not, if data, share data, breach, system, sure data, thing, NHS, and bad</td>
                  <td>59.21 (4.91)</td>
                  <td colspan="2">0.14</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>HA12<sup>d</sup></td>
                  <td>Requires data security: security, data, concern, concern about, assurance, data security, safety, internet, information, about security, security information, use data, security would, assure, and matter</td>
                  <td>66.33 (5.50)</td>
                  <td colspan="2">0.06</td>
                </tr>
                <tr valign="top">
                  <td colspan="6">
                    <bold>Understand research purpose and process</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>HA6</td>
                  <td>Depends on assurances and purpose: depend, would depend, 100, 100 percent, percent, guarantee, depend how, depend information, depend use, depend why, give, depend if, depend purpose, percent guarantee, and how use</td>
                  <td>59.94 (4.97)</td>
                  <td colspan="2">0.32</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>HA4</td>
                  <td>Consent to specific research: research, happy, specific, permission, study, purpose, condition, time, project, if know, only if, researcher, advance, consent, and research project</td>
                  <td>58.37 (4.84)</td>
                  <td colspan="2">0.08</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>HA9<sup>d</sup></td>
                  <td>Understanding how and why data would be used: benefit, data, would use, understand, would need, don’t know, data would, how would, want know, clear, would want, purpose, give, understand why, and benefit would</td>
                  <td>60 (4.98)</td>
                  <td colspan="2">0.07</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>HA8<sup>d</sup></td>
                  <td>Requires more information: information, would need, more information, detail, need more, need know, information would, would want, know more, why would, information use, anonymous, more about, more detail, and will use</td>
                  <td>72.84 (6.04)</td>
                  <td colspan="2">0.11</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>HA1<sup>d</sup></td>
                  <td>Benefits to health: health, benefit, researcher, link, professional, care, interest, health professional, health researcher, if health, health care, information, individual, would benefit, and don’t see</td>
                  <td>66.21 (5.49)</td>
                  <td colspan="2">0.06</td>
                </tr>
                <tr valign="top">
                  <td colspan="6">
                    <bold>Same answer as question 4 (store loyalty card question)</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>HA13</td>
                  <td>Same answer as Q4: q4, answer, 4, question, previous, question 4, answer q4, see answer, response, previous answer, see previous, answer 4, answer question, response q4, and affect</td>
                  <td>54.15 (4.49)</td>
                  <td colspan="2">0.12</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table4fn1">
                <p><sup>a</sup>HA: health app.</p>
              </fn>
              <fn id="table4fn2">
                <p><sup>b</sup>Total prevalence does not sum exactly to 100%, and the total survey response counts do not sum exactly to N because of rounding.</p>
              </fn>
              <fn id="table4fn3">
                <p><sup>c</sup>NHS: National Health Service.</p>
              </fn>
              <fn id="table4fn4">
                <p><sup>d</sup>Topic has been regrouped from the category that hierarchical clustering indicated as semantically similar to the category most thematically aligned with its contents.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
        <sec>
          <title>Making Recommendations</title>
          <p>The recommended actions that researchers can take to address the key barriers to data sharing and linkage identified through LDA modeling are presented. These recommendations are based on the synthesis of participant suggestions and expertise regarding wider research on data sharing.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Data Collection</title>
        <p>The <italic>LifeInfo Survey</italic> recruited 7101 participants. The primary results of this study are reported elsewhere [<xref ref-type="bibr" rid="ref28">28</xref>]. In brief, of those who reported using the services, 51.50% (2521/4895) responded favorably to sharing their loyalty card data for linkage to health records and 70.80% (1717/2425) responded favorably to sharing data from health and fitness apps or wearable devices to link with their health records. For the store loyalty card question, 62.28% (1489/2391) of respondents who answered <italic>no</italic> to whether they would share their data for linkage provided a free-text response. Of those who answered <italic>not sure</italic>, 66.02% (814/1233) provided a response. For the health and fitness app question, 50.82% (839/1651) of respondents who answered <italic>no</italic> to whether they would share their data for linkage provided a free-text response. Of those who answered <italic>not sure</italic>, 56.8% (565/995) provided a response.</p>
        <p>A number of respondents who had either answered <italic>yes</italic> or did not provide an answer to whether they would share their data provided free-text responses that were included in the analysis (n=35 for the store loyalty card question and n=127 for the health and fitness app question). In total, 2338 individuals provided a free-text response to the store loyalty card question and 1531 for the health and fitness app question. Preprocessing steps and removing single-word responses reduced the number of responses to 1930 for the store loyalty card question and 1206 for the health and fitness app question. Single-word responses were considered separately and are shown in <xref ref-type="table" rid="table1">Tables 1</xref> and <xref ref-type="table" rid="table2">2</xref>.</p>
      </sec>
      <sec>
        <title>LDA Modeling</title>
        <sec>
          <title>Topic Number Selection</title>
          <p>For the store loyalty card question, perplexity scores for each model of candidate number of topics (k) indicated that the best model had a number of topics within the range of k=20 to k=60, as the perplexity scores plateaued at their minimum value within this range (<xref rid="figure1" ref-type="fig">Figure 1</xref>). For the health and fitness app question, perplexity scores were minimized within the range of 20 to 30 topics (<xref rid="figure2" ref-type="fig">Figure 2</xref>). The average probabilistic coherence scores within these ranges of k varied by only very small amounts, indicating that the models had near-equivalent topic quality (<xref rid="figure3" ref-type="fig">Figures 3</xref> and <xref rid="figure4" ref-type="fig">4</xref>). To maximize human interpretation and for comparability between the 2 questions, parsimonious models of 20 topics were chosen for both the store loyalty card question and health and fitness app question.</p>
          <fig id="figure1" position="float">
            <label>Figure 1</label>
            <caption>
              <p>Perplexity scores for the models—LifeInfo store loyalty card question, 10-fold cross-validation of topic modeling to establish the optimal number of topics for latent Dirichlet allocation.</p>
            </caption>
            <graphic xlink:href="jmir_v23i5e24236_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>Perplexity scores for the models—LifeInfo health and fitness app question, 10-fold cross-validation of topic modeling to establish the optimal number of topics for latent Dirichlet allocation.</p>
            </caption>
            <graphic xlink:href="jmir_v23i5e24236_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>Average probabilistic coherence scores for the models—LifeInfo store loyalty card question, 10-fold cross-validation of topic modeling to establish the optimal number of topics for latent Dirichlet allocation.</p>
            </caption>
            <graphic xlink:href="jmir_v23i5e24236_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <fig id="figure4" position="float">
            <label>Figure 4</label>
            <caption>
              <p>Average probabilistic coherence scores for the models—LifeInfo health and fitness app question, 10-fold cross-validation of topic modeling to establish the optimal number of topics for latent Dirichlet allocation.</p>
            </caption>
            <graphic xlink:href="jmir_v23i5e24236_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>LDA Topics</title>
          <p><xref ref-type="table" rid="table3">Table 3</xref> shows the 20 topics created by the final LDA model of free-text responses to the store loyalty card question. The table includes the 15 top terms most representative of each topic, selected according to their ϕ values. As ϕ values are the probability of a term being used within a text given the topic, those terms with the highest ϕ values are representative of the topic content. The table includes 2 further topic measures: first, topic prevalence, which indicates how common each topic was within the survey responses (eg, prevalence 5.0 indicates 5% of survey responses fell into this topic), and second, probabilistic coherence, as an indicator of the semantic meaningfulness of each topic (this can range from 0 to 1, where higher numbers are more meaningful). Lower values of probabilistic coherence are owing to lower frequencies of topic term co-occurrences within texts and/or topic terms being highly frequent within the data set at large. Low coherence scores are associated with conceptually less defined topics [<xref ref-type="bibr" rid="ref40">40</xref>]; however, they could also be caused by <italic>fuzzy</italic> topics with many different terms or semantic crossover within other topics. <xref ref-type="table" rid="table4">Table 4</xref> shows the same information for the 20 topics created by LDA modeling of the health and fitness app question responses.</p>
          <p>LDA modeling also assigns survey responses with probabilities for each topic, known as <italic>θ values</italic>. A higher θ value for any given topic indicates that the response should more likely be categorized into that topic (<xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref> shows example responses most associated with each topic). Some responses, especially those that are very short (consisting of only 1 substantive term), are given equal θ values across all topics, which can be regarded as <italic>uncategorizable</italic> (<xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>). There are some differences in LDA topic frequency depending on demographic groups (eg, age and gender); further analysis demonstrating demographic patterns can be found in <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>. Topic stability is validated by comparing results across multiple LDA runs (<xref ref-type="supplementary-material" rid="app8">Multimedia Appendix 8</xref>), indicating that the thematic categories reported are consistently produced in topics created through LDA.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Understanding attitudes toward using big lifestyle data for health research is important for the success of research initiatives interested in using these data in the future. More than half of our participants who generated big lifestyle data reported that they would be happy for these data to be linked to health records for future research. As only individuals who stated that they were unwilling or unsure about sharing their lifestyle data were prompted to respond to what would make them change their mind, these topics identified specific barriers to data sharing. Topic modeling on survey responses produced thematic topics that summarized latent themes of concern to potential data subjects. We believe that the intelligence generated will support researchers in addressing these issues in the future with the appropriate use of safeguards and consent procedures to generate a publicly acceptable study design.</p>
        <p>It is also worth noting that, although LDA modeling aims to create distinct topics, individual responses may discuss multiple topics, and many of the topics identified were interconnected and complementary. In addition, topic quality varies and can be inferred by topic probabilistic coherence scores, which indicate how clearly defined each topic is semantically. For example, topic HA2 (<xref ref-type="table" rid="table4">Table 4</xref>) had a low value of probabilistic coherence (0.05) and primarily focused on the issue of privacy invasion; however, it also discussed the safety and security of data.</p>
      </sec>
      <sec>
        <title>Barriers to Data Linkage</title>
        <p>The topics uncovered by LDA modeling indicated that many of the same issues arise for both sharing store loyalty cards and health and fitness app data. Many topics can be matched across <xref ref-type="table" rid="table3">Tables 3</xref> and <xref ref-type="table" rid="table4">4</xref> or highlight similar themes. Overall, key barriers to the use and linkage of store loyalty cards and physical activity data for health research included data safety and security, personal privacy, the need for further understanding about the research and study purpose, fear that data could get into the <italic>wrong hands</italic>, problems with data accuracy, and not understanding the reason for data linkage. These barriers can potentially be addressed by researchers with varying degrees of ease. However, for some respondents, nothing would make them share these data, whereas others did not use store loyalty cards or health and fitness apps. Many of these issues are common in the literature on health, consumer, and personal data sharing, and as such, these are expected findings; however, new concerns also arise specific to individual data linkages. Example responses most associated with each topic are used throughout this discussion and are labeled with their relevant topic. These responses were selected as those with the highest probabilities of being categorized into a given topic and can be viewed in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>.</p>
        <sec>
          <title>Nothing Would Change My Mind</title>
          <p>Among those who responded negatively to sharing their lifestyle data for health research, a large proportion would be unwilling to change their mind. This is indicated first by the number of texts that LDA modeling categorized as the topic <italic>nothing would change mind</italic>, which refers to SC14 and HA17, constituting approximately 6.94% (134/1930) and 6.30% (76/1206) of the analyzed responses, respectively, and second, by the single-word answers excluded from the analysis, which were mainly the words <italic>nothing</italic>, <italic>no</italic>, and <italic>none</italic> for both the store loyalty card question and the health and fitness app question (<xref ref-type="table" rid="table1">Tables 1</xref> and <xref ref-type="table" rid="table2">2</xref>). These 3 single-word answers combined account for 90.3% (361/400) of the single-word responses about store loyalty cards and 85.8% (265/309) of single-word responses about health and fitness apps.</p>
        </sec>
        <sec>
          <title>“Don’t Use Services”</title>
          <p>Others mentioned that they did not use these services and thus would not have data to share. This was most clearly identified within the topics related to health and fitness apps (HA16 and HA11) and was evident for store loyalty cards (SC16). For example:</p>
          <disp-quote>
            <p>I’m not going to use a wearable device in the future.</p>
            <attrib>HA16</attrib>
          </disp-quote>
          <disp-quote>
            <p>Currently don’t use store loyalty cards.</p>
            <attrib>SC16</attrib>
          </disp-quote>
          <p>For this group, only greater participation in big lifestyle data production would allow them to share their data.</p>
        </sec>
        <sec>
          <title>“Big Brother” and “Privacy Invasion”</title>
          <p>Some respondents reported feeling that the proposed data linkage was akin to <italic>big brother</italic> and an invasion of privacy (SC8, SC1, HA2, and HA3). However, for the fitness app question, these topics were less clearly defined by LDA modeling, as reflected in the low probabilistic coherence scores for these topics (0.05 and 0.06). Answers such as “not interested to a ‘big brother is watching’ on all aspects of my life” (SC8) and “[I] wouldn’t want to feel every area of my life is out of my control and being watched by an institution that already makes me feel like I have no autonomy” (HA3) indicate a dislike and feeling of <italic>surveillance</italic> through the data. Actions by researchers to address these feelings are limited as they are related to a broader distrust of big data; however, greater transparency as to how data are being used and assurance of anonymity may convince some users. Within the responses about store loyalty cards, SC1 specifically identified privacy concerns related to unwanted emails, phone calls, and texts, which can be addressed through data protection and security actions.</p>
        </sec>
        <sec>
          <title>Personal Data and Linkage</title>
          <p>Many responses focused on concerns about sharing these data, which respondents perceived as highly personal information (HA20, HA14, SC19, and SC3). References to data being personal and confidential appeared across topic categories, particularly in the context of not wanting store loyalty card data and health records to be linked, data protection and security, and concerns about who is able to access these data. For example:</p>
          <disp-quote>
            <p>Nothing, this information is for my personal use. Access to my private devices can lead to security risks.</p>
            <attrib>HA14</attrib>
          </disp-quote>
          <p>The degree to which an individual believes that their data are personal and sensitive influences their willingness to share data and with whom. Medical information is personal, particularly for individuals with complex health conditions. Health and fitness app data and store loyalty card data are personal in different ways. Fitness apps are primarily used for personal monitoring, meaning data are not created with the idea that they might be shared with other actors, whereas for store loyalty card data, individuals exchange information about purchase history with shops in exchange for discounts and points. However, in the case of transaction data, Skatova et al [<xref ref-type="bibr" rid="ref13">13</xref>] found that people regarded the granularity of transactional data as personal.</p>
        </sec>
        <sec>
          <title>“Who Has Access to Private Data?”</title>
          <p>Health data are regarded as particularly sensitive and confidential [<xref ref-type="bibr" rid="ref26">26</xref>], related to the theme of <italic>private information</italic>, and many responses mentioned worries that data would be accessed by other actors without their permission (SC2, HA15, and HA7). For example, “if the information was available only to the research team, and not to others, e.g. insurance companies, mortgage companies, even the medical team” (HA15).</p>
          <p>Private companies, third parties, and health insurance companies were frequently mentioned by respondents as actors they feared would gain access to their data. This is expected, as research has found that these institutions are trusted least by the public to use data appropriately [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. Many respondents believed these companies would use their data for profit, to increase the cost of premiums, or to deny treatment altogether. However, respondents also mentioned concerns that health care professionals would be able to access their lifestyle data. For example:</p>
          <disp-quote>
            <p>It’s my information for me. If I want my doc to know it I’ll tell him/her or put it on my health record myself.</p>
            <attrib>HA7</attrib>
          </disp-quote>
          <p>This finding is less expected, as research finds that health care providers are one of the most trusted actors for data sharing [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. This indicates that there is no straightforward relationship between an individual’s willingness to share their data and their trust in the actors involved. A further barrier for data linkage is illuminated in that data are often created for specific purposes, and alternative uses of data outside of this domain can create suspicion.</p>
          <p>There was also a common misunderstanding that sharing lifestyle data for research would allow all involved actors to access these data. This was reflected in the previous example and others. For example, one respondent asked, “Why would I want Tesco knowing my health records?” (SC17).</p>
          <p>Similar findings were reported by Skatova et al [<xref ref-type="bibr" rid="ref13">13</xref>] in their research on transaction data sharing. This indicates that one easily achievable action that could influence people to share their data would be to explicitly state that data would not be available to anyone but researchers.</p>
          <p>Closely related to data access by others was a reported general belief that health records should not be linked with other data and should be accessed only by health professionals (SC17 and SC11). This again reflects the belief that data should only be used for its designated purpose. For example, one respondent felt that “Health records should be kept in health care” (SC17).</p>
        </sec>
        <sec>
          <title>Data Accuracy</title>
          <p>Data accuracy was another identified barrier. Many respondents indicated that their purchase history or fitness tracking was only partial, creating misleading data about lifestyle behaviors (SC7 and HA5). Indeed, missingness and data integrity have been identified as a challenge for research using big lifestyle data and a concern for participants [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. Nevertheless, good results have been found by comparing or combining these data with more traditional collection forms, for example, modeling individual consumption from household-level data [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref46">46</xref>].</p>
          <p>When looking at responses within this topic, most participants reported concerns that their data would make them appear <italic>less</italic> healthy than their real activity. For example:</p>
          <disp-quote>
            <p>I buy all my fruit and veg at a farm shop...my data would provide misleading associations.</p>
            <attrib>SC7</attrib>
          </disp-quote>
          <disp-quote>
            <p>[my mobile data] shows a terrible step count, but that’s because I don’t hold my phone while playing netball, long walks etc.</p>
            <attrib>HA5</attrib>
          </disp-quote>
          <p>Implicitly, respondents worried that they would be judged unfavorably on their lifestyle behaviors. Researchers could address these concerns by making explicit in the study protocol (1) that data are not expected to be complete, (2) the detailed actions they will take to accommodate this with modeling techniques or additional surveying, and (3) that all data would be made unidentifiable, foreclosing the possibility of judgment.</p>
        </sec>
        <sec>
          <title>Data Protection and Security</title>
          <p>Topics that focused on data protection and security made up a large proportion of the responses (approximately 17.31% (334/1930) of responses for the store loyalty card question and 19.90% (240/1206) for the health and fitness app question). Changes frequently mentioned were assurances that these data would be stored completely safely and securely and that they would be protected from hacks and data breaches. These are common concerns for data sharing, especially with data that are regarded as particularly private or sensitive, such as health data [<xref ref-type="bibr" rid="ref26">26</xref>]. Although these assurances were given as a condition for data sharing within the question wording (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), many responses highlighted their importance or were not convinced that this would happen. Action would, therefore, need to be taken to give participants greater confidence, which could be achieved through transparency in the research process and providing details of how these data will be protected.</p>
        </sec>
        <sec>
          <title>Understanding More About the Research Purpose, Process, and Benefit</title>
          <p>Across both store loyalty cards and health and fitness app data sharing, respondents mentioned that understanding the research better and being given more control over their participation would influence them to change their mind. This includes being provided with more information (SC18 and HA8), giving permission only for specific research projects (SC4 and HA4), and a greater understanding of the reason or benefits of research (SC20, SC4, SC6, HA9, and HA1). These findings are supported by the research of Skatova et al [<xref ref-type="bibr" rid="ref13">13</xref>], who found that support for data sharing is contingent on its context and purpose, highlighting the importance of well-informed participants. Given that the <italic>LifeInfo Survey</italic> aimed to assess attitudes toward data sharing in the future, rather than requesting participants to consent to data sharing at this time, the participant information sheet (<xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref>) needed to be suitably broad, which would not be the case when recruiting participants to an actual data linkage study.</p>
          <p>Researchers have found that a strong social motive, such as improvements to health or treatment, in addition to clearly defining the purpose of research, are key motivating factors for personal data donation [<xref ref-type="bibr" rid="ref47">47</xref>], whereas using health data for insurance, marketing purposes, or commercial exploitation is unacceptable to the public [<xref ref-type="bibr" rid="ref23">23</xref>]. This was reflected in the unstructured answers from the <italic>LifeInfo Survey</italic>. A respondent answered that they would be supportive “if the data fed into important public health or similar research and was not used to further commercial gains by these giants of commerce” (HA4).</p>
          <p>Again, although LDA separates topics, they are connected, and the involvement of private companies creates concerns about whether these entities will be able to access health data for profit once linked.</p>
          <p>Control and consent for health data being used for research have been found to be key for public acceptance [<xref ref-type="bibr" rid="ref25">25</xref>]. A respondent answered that the data “has to be linked to a condition or specific research project with additional approval provided in advance” (HA2).</p>
          <p>This is something that can be adapted into the research process, allowing participants to consent to or deny the use of their data, given the specifications of the study.</p>
          <p>In addition to a desire for more understanding, another thematic category was identified of individuals who did not understand the reason or relevance of data linkage (SC13 and SC10). For example, one respondent said, “Do not see any reason why they should be linked” (SC10).</p>
          <p>These topics were only produced when modeling the store loyalty card question, although topic HA5 also encompassed responses that stated data are <italic>not relevant</italic>. This perhaps indicates that respondents found the link between purchasing and health to be less relevant than fitness tracking and health.</p>
        </sec>
      </sec>
      <sec>
        <title>Strengths and Limitations</title>
        <p>The application of LDA modeling has clear strengths, enabling semiautomated analysis of large text corpora to readily identify barriers for data sharing. Nonetheless, some limitations present themselves; the topics identified through LDA modeling may hide rarer topics that do not have highly frequent mentions or a homogenous lexicon; for example, some <italic>LifeInfo Survey</italic> responses mention financial compensation (eg, vouchers), but this is not identified as a topic. In addition, texts that are linked by similar terms but are thematically different are sometimes grouped by LDA modeling; for example, HA6 included responses that require <italic>100 percent</italic> reassurance of certain criteria. However, these criteria span several different issues. These more granular findings may be better identified by human qualitative coding; however, this comes with its own limitations, especially for large data sets.</p>
        <p>The <italic>LifeInfo Survey</italic> sample size was large, thereby facilitating the identification of important topics; however, the size of this data set was smaller than those often used for topic modeling, and responses were relatively short. As previously mentioned, those texts that were extremely short were uncategorizable by the model, which is a limitation of this methodology and data. Similarly, the survey was designed to elicit responses only from those unwilling or unsure about sharing their data. This provided benefits as it focused the scope of topics on key barriers; yet, it would be insightful to obtain the opinions of those more supportive of lifestyle data sharing initiatives (who constituted 52.30% (2521/4820) of loyalty card holders and 70.80% (1717/2425) of health and fitness app users in our study), which should be considered in future studies. In examples where positive and negative responses are captured, it would be useful to explore a sentiment analysis approach to text mining; however, this was not relevant for our study.</p>
        <p>Research has found LDA model results to be sensitive to model hyperparameters [<xref ref-type="bibr" rid="ref44">44</xref>], and it is possible to use methods that optimize LDA across different α and β values. These methods were not applied in this study as trialing them increased the computational intensity of the analysis and did not provide better solutions.</p>
        <p>Due to resource limitations, the <italic>LifeInfo Survey</italic> questionnaires were only available in English. This means that we are unlikely to have reached the &lt;2% of the population who are not able to speak English. However, the traditionally hard-to-reach, most deprived communities and Asian and other ethnicities are overrepresented in the <italic>LifeInfo Survey</italic> [<xref ref-type="bibr" rid="ref28">28</xref>].</p>
      </sec>
      <sec>
        <title>Recommended Actions</title>
        <p>Several actions can be taken by researchers to directly address the key barriers to data sharing identified through LDA modeling, a summary of which is detailed in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref>.</p>
        <boxed-text id="box1" position="float">
          <title>Summary of recommendations to improve support for the linkage of novel consumer and lifestyle data with health records for research purposes.</title>
          <p>Motivation</p>
          <list list-type="bullet">
            <list-item>
              <p>Provide detailed and specific information about the study purpose and benefit.</p>
            </list-item>
          </list>
          <p>Control and consent</p>
          <list list-type="bullet">
            <list-item>
              <p>Provide detailed information about research and specific opt-in mechanisms to give participants more control.</p>
            </list-item>
          </list>
          <p>Access by others</p>
          <list list-type="bullet">
            <list-item>
              <p>Provide an explicit statement that data linkage does not give all parties access to linked data. Lifestyle data will not be shared with health services, and health records will not be shared with supermarkets or technology companies.</p>
            </list-item>
          </list>
          <p>Third-party access</p>
          <list list-type="bullet">
            <list-item>
              <p>Provide reassurances that data will not be shared with third parties, such as health insurers.</p>
            </list-item>
          </list>
          <p>Inaccurate data</p>
          <list list-type="bullet">
            <list-item>
              <p>Provide acknowledgment within the study specification that data might be partial and outline mechanisms for how this will be addressed, such as data quality checks, modeling techniques, and supplementary data collection.</p>
            </list-item>
          </list>
          <p>Non and/or infrequent use of services</p>
          <list list-type="bullet">
            <list-item>
              <p>Increase participation in novel data collection and more complete use.</p>
            </list-item>
          </list>
          <p>Data security and protection</p>
          <list list-type="bullet">
            <list-item>
              <p>Put in place stringent precautions to keep data protected from hacks or data breaches.</p>
            </list-item>
          </list>
          <p>Personal data</p>
          <list list-type="bullet">
            <list-item>
              <p>Provide assurances that data will be made anonymous and nonidentifiable.</p>
            </list-item>
          </list>
          <p>“Big Brother” and privacy invasion</p>
          <list list-type="bullet">
            <list-item>
              <p>Widely used good practice and exemplar studies, which provide a clear benefit to public health, and excellent data security could help increase trust in data sharing initiatives.</p>
            </list-item>
          </list>
        </boxed-text>
        <p>Findings and recommended actions incorporate some areas we hypothesized would emerge, for example, data security and protection, but are far more comprehensive and nuanced than the existing literature reflects.</p>
      </sec>
      <sec>
        <title>Future Work</title>
        <p>There is extensive scope to use LDA in future research, where free-text responses are collected in large surveys. LDA allows topics in free-text responses to be detected at a scale that is not feasible with more traditional qualitative thematic methods. Other text mining approaches, such as sentiment analysis, may be applicable where identification of positive and negative responses is important.</p>
        <p>More work is needed to be able to further unpick concerns within topics such as <italic>who has access to data</italic>, where this could be context-dependent, for example, patients with a complex medical history who are worried about being judged on their lifestyle behaviors by their clinical team, and <italic>big brother</italic>, where it is clear that greater transparency is required regarding data being used for research purposes.</p>
        <p>Future research could explore the utility of encouraging patients or research participants to use store loyalty cards and health and fitness apps or wearables as part of their personalized care, extending research that is already being done. This is of particular significance for those relating to the <italic>don’t use services</italic> topic.</p>
        <p>There has been significant interest in the use of supermarket loyalty cards and health and fitness app data in health research, in addition to the greater availability of these data in recent years. This provides exciting opportunities to gain new insights into lifestyle risk factors for diseases through individual-linked data. The growth of this research requires that the common concerns of participants regarding ethics, data security, research aims, and personal privacy, among others, are understood so that they can be addressed by future projects. Researchers may use the findings and recommended actions shared in this paper so that greater trust can be built in practices of data linkage for health research.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>Analysis of the <italic>LifeInfo Survey</italic> responses with topic modeling techniques revealed key barriers that prevent people from willingly sharing their novel lifestyle data for health research. This large-scale public consultation provides actionable recommendations that will allow researchers using big lifestyle data to adapt their study design and provide safeguards based on expressed concerns important to the general public that are specific to novel lifestyle data and health record linkage.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>LifeInfo questionnaire.</p>
        <media xlink:href="jmir_v23i5e24236_app1.pdf" xlink:title="PDF File (Adobe PDF File), 217 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Supplementary information regarding data cleaning and processing steps.</p>
        <media xlink:href="jmir_v23i5e24236_app2.docx" xlink:title="DOCX File, 14 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Question response lengths before and after data processing.</p>
        <media xlink:href="jmir_v23i5e24236_app3.docx" xlink:title="DOCX File, 28 KB"/>
      </supplementary-material>
      <supplementary-material id="app4">
        <label>Multimedia Appendix 4</label>
        <p>Dendrograms of hierarchical clustering of topics produced from latent Dirichlet allocation models.</p>
        <media xlink:href="jmir_v23i5e24236_app4.docx" xlink:title="DOCX File, 91 KB"/>
      </supplementary-material>
      <supplementary-material id="app5">
        <label>Multimedia Appendix 5</label>
        <p>Tables showing the 20 topics produced by latent Dirichlet allocation modeling and the top 5 words per topic according to the response θ value.</p>
        <media xlink:href="jmir_v23i5e24236_app5.docx" xlink:title="DOCX File, 35 KB"/>
      </supplementary-material>
      <supplementary-material id="app6">
        <label>Multimedia Appendix 6</label>
        <p>Summary of texts that are not categorizable into any specific topic by the latent Dirichlet allocation model.</p>
        <media xlink:href="jmir_v23i5e24236_app6.docx" xlink:title="DOCX File, 13 KB"/>
      </supplementary-material>
      <supplementary-material id="app7">
        <label>Multimedia Appendix 7</label>
        <p>Graphs showing mean topic prevalence and standard error bars broken down by demographic groups.</p>
        <media xlink:href="jmir_v23i5e24236_app7.docx" xlink:title="DOCX File, 74 KB"/>
      </supplementary-material>
      <supplementary-material id="app8">
        <label>Multimedia Appendix 8</label>
        <p>Coded tables of multiple latent Dirichlet allocation outputs to test topic stability.</p>
        <media xlink:href="jmir_v23i5e24236_app8.docx" xlink:title="DOCX File, 43 KB"/>
      </supplementary-material>
      <supplementary-material id="app9">
        <label>Multimedia Appendix 9</label>
        <p>Participant information sheet.</p>
        <media xlink:href="jmir_v23i5e24236_app9.pdf" xlink:title="PDF File (Adobe PDF File), 271 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">DTM</term>
          <def>
            <p>document term matrix</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">LDA</term>
          <def>
            <p>latent Dirichlet allocation</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors would like to thank all the LifeInfo study participants. The authors would also like to thank the research team members at Leeds Teaching Hospitals National Health Service Trust, Kinga Dwornik, Patric Devitt, Linda Bamford, Ian Thompson, Amanda Friend, Linda Roberts, Richard Evans, and Laura Kelly; Leeds Institute for Data Analytics colleagues, Kimberley Wright, Hayley Irving, Chris Carrigan, and Maria Galazoula; Low Moor Medical Practice colleagues, Mark Cade and Maureen Rowland; Leeds City Council colleague, Thomas Woolley; and the UseMyData group. The LifeInfo study has been supported by the National Institute for Health Research Clinical Research Network through portfolio adoption, the Consumer Data Research Centre grant ref: ES/L011891/1, the Medical Bioinformatics Centre ref: MR/L01629X/1, and the School of Medicine, University of Leeds, through their Academic Development Fund.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>MM is an inventor and shareholder at Dietary Assessment Ltd.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Afshin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sur</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Fay</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Cornaby</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Ferrara</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Salama</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Mullany</surname>
              <given-names>EC</given-names>
            </name>
            <name name-style="western">
              <surname>Abate</surname>
              <given-names>KH</given-names>
            </name>
            <name name-style="western">
              <surname>Abbafati</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Abebe</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Afarideh</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aggarwal</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Agrawal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Akinyemiju</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Alahdab</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Bacha</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Bachman</surname>
              <given-names>VF</given-names>
            </name>
            <name name-style="western">
              <surname>Badali</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Badawi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bensenor</surname>
              <given-names>IM</given-names>
            </name>
            <name name-style="western">
              <surname>Bernabe</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Biadgilign</surname>
              <given-names>SK</given-names>
            </name>
            <name name-style="western">
              <surname>Biryukov</surname>
              <given-names>SH</given-names>
            </name>
            <name name-style="western">
              <surname>Cahill</surname>
              <given-names>LE</given-names>
            </name>
            <name name-style="western">
              <surname>Carrero</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Cercy</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>Dandona</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Dandona</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Dang</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Degefa</surname>
              <given-names>MG</given-names>
            </name>
            <name name-style="western">
              <surname>Zaki</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Esteghamati</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Esteghamati</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Fanzo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Farinha</surname>
              <given-names>CS</given-names>
            </name>
            <name name-style="western">
              <surname>Farvid</surname>
              <given-names>MS</given-names>
            </name>
            <name name-style="western">
              <surname>Farzadfar</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Feigin</surname>
              <given-names>VL</given-names>
            </name>
            <name name-style="western">
              <surname>Fernandes</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Flor</surname>
              <given-names>LS</given-names>
            </name>
            <name name-style="western">
              <surname>Foigt</surname>
              <given-names>NA</given-names>
            </name>
            <name name-style="western">
              <surname>Forouzanfar</surname>
              <given-names>MH</given-names>
            </name>
            <name name-style="western">
              <surname>Ganji</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Geleijnse</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Gillum</surname>
              <given-names>RF</given-names>
            </name>
            <name name-style="western">
              <surname>Goulart</surname>
              <given-names>AC</given-names>
            </name>
            <name name-style="western">
              <surname>Grosso</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Guessous</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Hamidi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hankey</surname>
              <given-names>GJ</given-names>
            </name>
            <name name-style="western">
              <surname>Harikrishnan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hassen</surname>
              <given-names>HY</given-names>
            </name>
            <name name-style="western">
              <surname>Hay</surname>
              <given-names>SI</given-names>
            </name>
            <name name-style="western">
              <surname>Hoang</surname>
              <given-names>CL</given-names>
            </name>
            <name name-style="western">
              <surname>Horino</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Islami</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Jackson</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>James</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Johansson</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Jonas</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Kasaeian</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Khader</surname>
              <given-names>YS</given-names>
            </name>
            <name name-style="western">
              <surname>Khalil</surname>
              <given-names>IA</given-names>
            </name>
            <name name-style="western">
              <surname>Khang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kimokoti</surname>
              <given-names>RW</given-names>
            </name>
            <name name-style="western">
              <surname>Kokubo</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kumar</surname>
              <given-names>GA</given-names>
            </name>
            <name name-style="western">
              <surname>Lallukka</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Lopez</surname>
              <given-names>AD</given-names>
            </name>
            <name name-style="western">
              <surname>Lorkowski</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lotufo</surname>
              <given-names>PA</given-names>
            </name>
            <name name-style="western">
              <surname>Lozano</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Malekzadeh</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>März</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Meier</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Melaku</surname>
              <given-names>YA</given-names>
            </name>
            <name name-style="western">
              <surname>Mendoza</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Mensink</surname>
              <given-names>GB</given-names>
            </name>
            <name name-style="western">
              <surname>Micha</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>TR</given-names>
            </name>
            <name name-style="western">
              <surname>Mirarefin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mohan</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Mokdad</surname>
              <given-names>AH</given-names>
            </name>
            <name name-style="western">
              <surname>Mozaffarian</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Nagel</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Naghavi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>CT</given-names>
            </name>
            <name name-style="western">
              <surname>Nixon</surname>
              <given-names>MR</given-names>
            </name>
            <name name-style="western">
              <surname>Ong</surname>
              <given-names>KL</given-names>
            </name>
            <name name-style="western">
              <surname>Pereira</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Poustchi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Qorbani</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rai</surname>
              <given-names>RK</given-names>
            </name>
            <name name-style="western">
              <surname>Razo-García</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Rehm</surname>
              <given-names>CD</given-names>
            </name>
            <name name-style="western">
              <surname>Rivera</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Rodríguez-Ramírez</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Roshandel</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Roth</surname>
              <given-names>GA</given-names>
            </name>
            <name name-style="western">
              <surname>Sanabria</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sánchez-Pimienta</surname>
              <given-names>TG</given-names>
            </name>
            <name name-style="western">
              <surname>Sartorius</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidhuber</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Schutte</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Sepanlou</surname>
              <given-names>SG</given-names>
            </name>
            <name name-style="western">
              <surname>Shin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sorensen</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Springmann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Szponar</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Thorne-Lyman</surname>
              <given-names>AL</given-names>
            </name>
            <name name-style="western">
              <surname>Thrift</surname>
              <given-names>AG</given-names>
            </name>
            <name name-style="western">
              <surname>Touvier</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Tran</surname>
              <given-names>BX</given-names>
            </name>
            <name name-style="western">
              <surname>Tyrovolas</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ukwaja</surname>
              <given-names>KN</given-names>
            </name>
            <name name-style="western">
              <surname>Ullah</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Uthman</surname>
              <given-names>OA</given-names>
            </name>
            <name name-style="western">
              <surname>Vaezghasemi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Vasankari</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Vollset</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Vos</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Vu</surname>
              <given-names>GT</given-names>
            </name>
            <name name-style="western">
              <surname>Vu</surname>
              <given-names>LG</given-names>
            </name>
            <name name-style="western">
              <surname>Weiderpass</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Werdecker</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wijeratne</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Willett</surname>
              <given-names>WC</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Yonemoto</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Murray</surname>
              <given-names>CJ</given-names>
            </name>
          </person-group>
          <article-title>Health effects of dietary risks in 195 countries, 1990–2017: a systematic analysis for the Global Burden of Disease Study 2017</article-title>
          <source>Lancet</source>
          <year>2019</year>
          <month>05</month>
          <volume>393</volume>
          <issue>10184</issue>
          <fpage>1958</fpage>
          <lpage>72</lpage>
          <pub-id pub-id-type="doi">10.1016/S0140-6736(19)30041-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Shiroma</surname>
              <given-names>EJ</given-names>
            </name>
            <name name-style="western">
              <surname>Lobelo</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Puska</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Blair</surname>
              <given-names>SN</given-names>
            </name>
            <name name-style="western">
              <surname>Katzmarzyk</surname>
              <given-names>PT</given-names>
            </name>
          </person-group>
          <article-title>Effect of physical inactivity on major non-communicable diseases worldwide: an analysis of burden of disease and life expectancy</article-title>
          <source>Lancet</source>
          <year>2012</year>
          <month>07</month>
          <volume>380</volume>
          <issue>9838</issue>
          <fpage>219</fpage>
          <lpage>29</lpage>
          <pub-id pub-id-type="doi">10.1016/S0140-6736(12)61031-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="web">
          <article-title>Physical activity: applying all our health</article-title>
          <source>Public Health England</source>
          <year>2019</year>
          <access-date>2020-12-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.gov.uk/government/publications/physical-activity-applying-all-our-health/physical-activity-applying-all-our-health">https://www.gov.uk/government/publications/physical-activity-applying-all-our-health/physical-activity-applying-all-our-health</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="web">
          <article-title>The food system in data - Part One</article-title>
          <source>National Food Strategy</source>
          <access-date>2020-12-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.nationalfoodstrategy.org/partone/">https://www.nationalfoodstrategy.org/partone/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Timmins</surname>
              <given-names>KA</given-names>
            </name>
            <name name-style="western">
              <surname>Green</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Radley</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Morris</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Pearce</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>How has big data contributed to obesity research? A review of the literature</article-title>
          <source>Int J Obes (Lond)</source>
          <year>2018</year>
          <month>12</month>
          <day>18</day>
          <volume>42</volume>
          <issue>12</issue>
          <fpage>1951</fpage>
          <lpage>62</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/30022056"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41366-018-0153-7</pub-id>
          <pub-id pub-id-type="medline">30022056</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41366-018-0153-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC6291419</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Morris</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Wilkins</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Timmins</surname>
              <given-names>KA</given-names>
            </name>
            <name name-style="western">
              <surname>Bryant</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Birkin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Griffiths</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Can big data solve a big problem? Reporting the obesity data landscape in line with the Foresight obesity system map</article-title>
          <source>Int J Obes (Lond)</source>
          <year>2018</year>
          <month>12</month>
          <day>21</day>
          <volume>42</volume>
          <issue>12</issue>
          <fpage>1963</fpage>
          <lpage>76</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/30242238"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41366-018-0184-0</pub-id>
          <pub-id pub-id-type="medline">30242238</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41366-018-0184-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC6291418</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Green</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Watson</surname>
              <given-names>AW</given-names>
            </name>
            <name name-style="western">
              <surname>Brunstrom</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Corfe</surname>
              <given-names>BM</given-names>
            </name>
            <name name-style="western">
              <surname>Johnstone</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>EA</given-names>
            </name>
            <name name-style="western">
              <surname>Stevenson</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Comparing supermarket loyalty card data with traditional diet survey data for understanding how protein is purchased and consumed in older adults for the UK, 2014-16</article-title>
          <source>Nutr J</source>
          <year>2020</year>
          <month>08</month>
          <day>13</day>
          <volume>19</volume>
          <issue>83</issue>
          <fpage>1</fpage>
          <lpage>10</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://nutritionj.biomedcentral.com/articles/10.1186/s12937-020-00602-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12937-020-00602-3</pub-id>
          <pub-id pub-id-type="medline">32791968</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12937-020-00602-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC7427066</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shephard</surname>
              <given-names>RJ</given-names>
            </name>
          </person-group>
          <article-title>Limits to the measurement of habitual physical activity by questionnaires</article-title>
          <source>Br J Sports Med</source>
          <year>2003</year>
          <month>06</month>
          <day>01</day>
          <volume>37</volume>
          <issue>3</issue>
          <fpage>197</fpage>
          <lpage>206</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bjsm.bmj.com/lookup/pmidlookup?view=long&#38;pmid=12782543"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bjsm.37.3.197</pub-id>
          <pub-id pub-id-type="medline">12782543</pub-id>
          <pub-id pub-id-type="pmcid">PMC1724653</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bidargaddi</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Musiat</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Makinen</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Ermes</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Schrader</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Licinio</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Digital footprints: facilitating large-scale environmental psychiatric research in naturalistic settings through data from everyday technologies</article-title>
          <source>Mol Psychiatry</source>
          <year>2017</year>
          <month>02</month>
          <volume>22</volume>
          <issue>2</issue>
          <fpage>164</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/27922603"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/mp.2016.224</pub-id>
          <pub-id pub-id-type="medline">27922603</pub-id>
          <pub-id pub-id-type="pii">mp2016224</pub-id>
          <pub-id pub-id-type="pmcid">PMC5285463</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nevalainen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Erkkola</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Saarijärvi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Näppilä</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Fogelholm</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Large-scale loyalty card data in health research</article-title>
          <source>Digit Health</source>
          <year>2018</year>
          <month>11</month>
          <day>29</day>
          <volume>4</volume>
          <fpage>2055207618816898</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/10.1177/2055207618816898?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub%3dpubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/2055207618816898</pub-id>
          <pub-id pub-id-type="medline">30546912</pub-id>
          <pub-id pub-id-type="pii">10.1177_2055207618816898</pub-id>
          <pub-id pub-id-type="pmcid">PMC6287323</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Aiello</surname>
              <given-names>LM</given-names>
            </name>
            <name name-style="western">
              <surname>Schifanella</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Quercia</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Del Prete</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Large-scale and high-resolution analysis of food purchases and health outcomes</article-title>
          <source>EPJ Data Sci</source>
          <year>2019</year>
          <month>04</month>
          <day>30</day>
          <volume>8</volume>
          <issue>14</issue>
          <fpage>1</fpage>
          <lpage>22</lpage>
          <pub-id pub-id-type="doi">10.1140/epjds/s13688-019-0191-y</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="web">
          <article-title>About the CDRC</article-title>
          <source>Consumer Data Research Centre</source>
          <access-date>2020-12-06</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cdrc.ac.uk/about/">https://www.cdrc.ac.uk/about/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Skatova</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shiells</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Boyd</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Attitudes towards transactional data donation and linkage in a longitudinal population study: evidence from the Avon Longitudinal Study of Parents and Children</article-title>
          <source>Wellcome Open Res</source>
          <year>2019</year>
          <month>12</month>
          <day>03</day>
          <volume>4</volume>
          <fpage>192</fpage>
          <pub-id pub-id-type="doi">10.12688/wellcomeopenres.15557.1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mamiya</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Moodie</surname>
              <given-names>EE</given-names>
            </name>
            <name name-style="western">
              <surname>Buckeridge</surname>
              <given-names>DL</given-names>
            </name>
          </person-group>
          <article-title>A novel application of point-of-sales grocery transaction data to enhance community nutrition monitoring</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2017</year>
          <volume>2017</volume>
          <fpage>1253</fpage>
          <lpage>61</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/29854194"/>
          </comment>
          <pub-id pub-id-type="medline">29854194</pub-id>
          <pub-id pub-id-type="pmcid">PMC5977589</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Shute</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Jenneson</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Rains</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Morris</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Compliance with the Eatwell guide: a case study using supermarket transaction records in Yorkshire and the Humber</article-title>
          <source>Proc Nutr Soc</source>
          <year>2020</year>
          <month>06</month>
          <day>10</day>
          <volume>79</volume>
          <issue>OCE2</issue>
          <fpage>e665</fpage>
          <pub-id pub-id-type="doi">10.1017/S002966512000614X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jenneson</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Shute</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Greenwood</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Clarke</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rains</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Morris</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Variation in fruit and vegetable purchasing patterns in Leeds: using novel loyalty card transaction data</article-title>
          <source>Proc Nutr Soc</source>
          <year>2020</year>
          <month>06</month>
          <day>10</day>
          <volume>79</volume>
          <issue>OCE2</issue>
          <fpage>e670</fpage>
          <pub-id pub-id-type="doi">10.1017/S0029665120006199</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gay</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Leijdekkers</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Bringing health and fitness data together for connected health care: mobile apps as enablers of interoperability</article-title>
          <source>J Med Internet Res</source>
          <year>2015</year>
          <month>11</month>
          <day>18</day>
          <volume>17</volume>
          <issue>11</issue>
          <fpage>e260</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jmir.org/2015/11/e260/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/jmir.5094</pub-id>
          <pub-id pub-id-type="medline">26581920</pub-id>
          <pub-id pub-id-type="pii">v17i11e260</pub-id>
          <pub-id pub-id-type="pmcid">PMC4704968</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brinton</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Keating</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Ortiz</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Evenson</surname>
              <given-names>KR</given-names>
            </name>
            <name name-style="western">
              <surname>Furberg</surname>
              <given-names>RD</given-names>
            </name>
          </person-group>
          <article-title>Establishing linkages between distributed survey responses and consumer wearable device datasets: a pilot protocol</article-title>
          <source>JMIR Res Protoc</source>
          <year>2017</year>
          <month>04</month>
          <day>27</day>
          <volume>6</volume>
          <issue>4</issue>
          <fpage>e66</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.researchprotocols.org/2017/4/e66/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/resprot.6513</pub-id>
          <pub-id pub-id-type="medline">28450274</pub-id>
          <pub-id pub-id-type="pii">v6i4e66</pub-id>
          <pub-id pub-id-type="pmcid">PMC5427248</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hicks</surname>
              <given-names>JL</given-names>
            </name>
            <name name-style="western">
              <surname>Althoff</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sosic</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kuhar</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bostjancic</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>King</surname>
              <given-names>AC</given-names>
            </name>
            <name name-style="western">
              <surname>Leskovec</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Delp</surname>
              <given-names>SL</given-names>
            </name>
          </person-group>
          <article-title>Best practices for analyzing large-scale health data from wearables and smartphone apps</article-title>
          <source>NPJ Digit Med</source>
          <year>2019</year>
          <month>06</month>
          <day>03</day>
          <volume>2</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>12</lpage>
          <pub-id pub-id-type="doi">10.1038/s41746-019-0121-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>Minerva</collab>
          </person-group>
          <article-title>Problems with care.data and other stories</article-title>
          <source>Br Med J</source>
          <year>2015</year>
          <volume>351</volume>
          <fpage>h4613</fpage>
          <pub-id pub-id-type="doi">10.1136/bmj.h4613</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sweeney</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Simple Demographics Often Identify People Uniquely</article-title>
          <source>Carnegie Mellon University</source>
          <year>2000</year>
          <access-date>2021-03-04</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dataprivacylab.org/projects/identifiability/paper1.pdf">http://dataprivacylab.org/projects/identifiability/paper1.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ritchie</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Confidentiality and linked data</article-title>
          <source>arXiv</source>
          <access-date>2021-03-04</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://gss.civilservice.gov.uk/guidances/quality/nsqr/privacy-and-data-confidentiality-">https://gss.civilservice.gov.uk/guidances/quality/nsqr/privacy-and-data-confidentiality-</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Foundations of fairness: where next for NHS health data partnerships?</article-title>
          <source>Ada Lovelace Institute</source>
          <year>2020</year>
          <month>03</month>
          <access-date>2021-05-06</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://understandingpatientdata.org.uk/sites/default/files/2020-03/Foundations%20of%20Fairness%20-%20Summary%20and%20Analysis.pdf">https://understandingpatientdata.org.uk/sites/default/files/2020-03/Foundations%20of%20Fairness%20-%20Summary%20and%20Analysis.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="web">
          <article-title>ODI survey reveals British consumer attitudes to sharing personal data</article-title>
          <source>Open Data Institute</source>
          <year>2018</year>
          <access-date>2020-05-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://theodi.org/article/odi-survey-reveals-british-consumer-attitudes-to-sharing-personal-data/">https://theodi.org/article/odi-survey-reveals-british-consumer-attitudes-to-sharing-personal-data/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Aitken</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>de St Jorre</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Pagliari</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Jepson</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Cunningham-Burley</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Public responses to the sharing and linkage of health data for research purposes: a systematic review and thematic synthesis of qualitative studies</article-title>
          <source>BMC Med Ethics</source>
          <year>2016</year>
          <month>11</month>
          <day>10</day>
          <volume>17</volume>
          <issue>1</issue>
          <fpage>73</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedethics.biomedcentral.com/articles/10.1186/s12910-016-0153-x"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12910-016-0153-x</pub-id>
          <pub-id pub-id-type="medline">27832780</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12910-016-0153-x</pub-id>
          <pub-id pub-id-type="pmcid">PMC5103425</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="web">
          <article-title>Summary report of qualitative research into public attitudes to personal data and linking personal data: summary report / Wellcome Trust</article-title>
          <source>Wellcome Trust (London, England)</source>
          <year>2013</year>
          <access-date>2020-12-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://wellcomelibrary.org/item/b20997358#?c=0&#38;m=0&#38;s=0&#38;cv=0">https://wellcomelibrary.org/item/b20997358#?c=0&#38;m=0&#38;s=0&#38;cv=0</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="web">
          <article-title>Royal Society, Trust Deficit - Lessons for Policymakers</article-title>
          <source>Royal Statistical Society (RSS)</source>
          <access-date>2019-07-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.statslife.org.uk/news/1672">http://www.statslife.org.uk/news/1672</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Morris</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Public attitude to linking lifestyle data to health records for research: the LifeInfo study</article-title>
          <source>Forthcoming</source>
          <year>2021</year>
          <access-date>2021-05-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://osf.io/4bmpe/">https://osf.io/4bmpe/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Roberts</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Stewart</surname>
              <given-names>BM</given-names>
            </name>
            <name name-style="western">
              <surname>Tingley</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Lucas</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Leder-Luis</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Gadarian</surname>
              <given-names>SK</given-names>
            </name>
            <name name-style="western">
              <surname>Albertson</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Rand</surname>
              <given-names>DG</given-names>
            </name>
          </person-group>
          <article-title>Structural topic models for open-ended survey responses</article-title>
          <source>Am J Pol Sci</source>
          <year>2014</year>
          <month>03</month>
          <day>06</day>
          <volume>58</volume>
          <issue>4</issue>
          <fpage>1064</fpage>
          <lpage>82</lpage>
          <pub-id pub-id-type="doi">10.1111/ajps.12103</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Blei</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>AY</given-names>
            </name>
            <name name-style="western">
              <surname>Jordan</surname>
              <given-names>MI</given-names>
            </name>
          </person-group>
          <article-title>Latent Dirichlet allocation</article-title>
          <source>J Mach Learn Res 3</source>
          <year>2003</year>
          <access-date>2021-05-06</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf">https://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lund</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Summary report on the 2013 Healthy Communities survey</article-title>
          <source>Leeds City Council</source>
          <year>2013</year>
          <access-date>2020-10-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://leedsobs.wpengine.com/wp-content/uploads/2018/03/Healthy-Communities-Survey-2013-Analysis-Report.pdf">https://leedsobs.wpengine.com/wp-content/uploads/2018/03/Healthy-Communities-Survey-2013-Analysis-Report.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Morris</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>LifeInfo study</article-title>
          <source>Open Science Framework</source>
          <access-date>2020-09-14</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://osf.io/2rsnd/">https://osf.io/2rsnd/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="web">
          <article-title>R: a language and environment for statistical computing</article-title>
          <source>R Software</source>
          <year>2019</year>
          <access-date>2020-06-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.r-project.org">https://www.r-project.org</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lau</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Baldwin</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Newman</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>On collocations and topic models</article-title>
          <source>ACM Trans Speech Lang Process</source>
          <year>2013</year>
          <month>07</month>
          <volume>10</volume>
          <issue>3</issue>
          <fpage>1</fpage>
          <lpage>14</lpage>
          <pub-id pub-id-type="doi">10.1145/2483969.2483972</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wallach</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Topic modeling: beyond bag-of-words</article-title>
          <source>Proceedings of the 23rd International Conference on Machine Learning</source>
          <year>2006</year>
          <conf-name>23rd International Conference on Machine Learning</conf-name>
          <conf-date>2006</conf-date>
          <conf-loc>Carnegie Mellon University, Pittsburgh, Pennsylvania</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dirichlet.net/pdf/wallach06topic.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nokel</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Loukachevitch</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>A method of accounting bigrams in topic models</article-title>
          <source>Proceedings of the 11th Workshop on Multiword Expressions</source>
          <year>2015</year>
          <conf-name>11th Workshop on Multiword Expressions</conf-name>
          <conf-date>June 2015</conf-date>
          <conf-loc>Denver, Colorado</conf-loc>
          <pub-id pub-id-type="doi">10.3115/v1/w15-0901</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Greene</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>O'Callaghan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Cunningham</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>How many topics? Stability analysis for topic models</article-title>
          <source>Machine Learning and Knowledge Discovery in Databases</source>
          <year>2014</year>
          <publisher-loc>Berlin, Heidelberg</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>498</fpage>
          <lpage>513</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rosen-Zvi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Griffiths</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Steyvers</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Smyth</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>The author-topic model for authors and documents</article-title>
          <source>arXiv.org</source>
          <year>2012</year>
          <access-date>2020-07-13</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1207.4169">http://arxiv.org/abs/1207.4169</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Boyd-Graber</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Gerrish</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Blei</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Reading tea leaves: how humans interpret topic models</article-title>
          <source>Proceedings of the 23rd Annual Conference on Neural Information Processing Systems</source>
          <year>2009</year>
          <conf-name>23rd Annual Conference on Neural Information Processing Systems</conf-name>
          <conf-date>December 7-10, 2009</conf-date>
          <conf-loc>Vancouver, British Columbia, Canada</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://rexa.info"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mimno</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Wallach</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Talley</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Leenders</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>McCallum</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Optimizing semantic coherence in topic models</article-title>
          <source>Proceedings of the 2011 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2011</year>
          <conf-name>Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>July, 2011</conf-date>
          <conf-loc>Edinburgh, Scotland, UK</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/D11-1024"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>TW</given-names>
            </name>
          </person-group>
          <source>Topic modeling</source>
          <year>2019</year>
          <access-date>2020-07-13</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cran.r-project.org/web/packages/textmineR/vignettes/c_topic_modeling.html">https://cran.r-project.org/web/packages/textmineR/vignettes/c_topic_modeling.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rosner</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Hinneburg</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Röder</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Nettling</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Both</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Evaluating topic coherence measures</article-title>
          <source>arXiv.org</source>
          <year>2014</year>
          <access-date>2020-07-13</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1403.6397">http://arxiv.org/abs/1403.6397</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Röder</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Both</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hinneburg</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Exploring the space of topic coherence measures</article-title>
          <source>Proceedings of the Eighth ACM International Conference on Web Search and Data Mining</source>
          <year>2015</year>
          <conf-name>Eighth ACM International Conference on Web Search and Data Mining</conf-name>
          <conf-date>February, 2015</conf-date>
          <conf-loc>Shanghai, China</conf-loc>
          <fpage>399</fpage>
          <lpage>408</lpage>
          <pub-id pub-id-type="doi">10.1145/2684822.2685324</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wallach</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Mimno</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>McCallum</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Rethinking LDA: why priors matter</article-title>
          <source>Proceedings of the 22nd International Conference on Neural Information Processing Systems</source>
          <year>2009</year>
          <conf-name>22nd International Conference on Neural Information Processing Systems</conf-name>
          <conf-date>December, 2009</conf-date>
          <conf-loc>Vancouver, B.C., Canada</conf-loc>
          <fpage>1973</fpage>
          <lpage>81</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/abs/10.5555/2984093.2984314"/>
          </comment>
          <pub-id pub-id-type="doi">10.5555/2984093.2984314</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Upton</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Cook</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <source>A dictionary of statistics</source>
          <year>2014</year>
          <publisher-loc>Oxford</publisher-loc>
          <publisher-name>Oxford University Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Becker</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Comparability of household and individual food consumption data—evidence from Sweden</article-title>
          <source>Public Health Nutr</source>
          <year>2001</year>
          <month>10</month>
          <volume>4</volume>
          <issue>5B</issue>
          <fpage>1177</fpage>
          <lpage>82</lpage>
          <pub-id pub-id-type="medline">11924944</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Skatova</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Goulding</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Psychology of personal data donation</article-title>
          <source>PLoS One</source>
          <year>2019</year>
          <month>11</month>
          <day>20</day>
          <volume>14</volume>
          <issue>11</issue>
          <fpage>e0224240</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0224240"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0224240</pub-id>
          <pub-id pub-id-type="medline">31747408</pub-id>
          <pub-id pub-id-type="pii">PONE-D-19-16219</pub-id>
          <pub-id pub-id-type="pmcid">PMC6867598</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
