<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
    <article-id pub-id-type="publisher-id">v19i3e65</article-id>
    <article-id pub-id-type="pmid">28298265</article-id>
    <article-id pub-id-type="doi">10.2196/jmir.6533</article-id>
    <article-categories>
      <subj-group subj-group-type="heading">
        <subject>Original Paper</subject>
      </subj-group>
      <subj-group subj-group-type="article-type">
        <subject>Original Paper</subject>
      </subj-group>
    </article-categories>
    <title-group>
      <article-title>Supervised Machine Learning Algorithms Can Classify Open-Text Feedback of Doctor Performance With Human-Level Accuracy</article-title>
    </title-group>
    <contrib-group>
      <contrib contrib-type="editor">
        <name>
          <surname>Parra-Calderón</surname>
          <given-names>Carlos Luis</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Cruz-Díaz</surname>
          <given-names>Noa Patricia</given-names>
        </name>
      </contrib>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Giacomelli</surname>
          <given-names>Piero</given-names>
        </name>
      </contrib>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Singh</surname>
          <given-names>Harpreet</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="author" id="contrib1" corresp="yes" equal-contrib="yes">
      <name name-style="western">
        <surname>Gibbons</surname>
        <given-names>Chris</given-names>
      </name>
      <degrees>PhD</degrees>
      <xref rid="aff1" ref-type="aff">1</xref>
      <xref rid="aff2" ref-type="aff">2</xref>
      <address>
        <institution>The Psychometrics Centre</institution>
        <institution>University of Cambridge</institution>
        <addr-line>16 Mill Lane</addr-line>
        <addr-line>Cambridge, CB2 1RH</addr-line>
        <country>United Kingdom</country>
        <phone>44 1223 765 203</phone>
        <fax>44 1223 765 203</fax>
        <email>cg598@cam.ac.uk</email>
      </address>  
      <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-4732-7305</ext-link></contrib>
      <contrib contrib-type="author" id="contrib2" equal-contrib="yes">
        <name name-style="western">
          <surname>Richards</surname>
          <given-names>Suzanne</given-names>
        </name>
        <degrees>PhD</degrees>
        <xref rid="aff3" ref-type="aff">3</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0003-1416-0569</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib3" equal-contrib="yes">
        <name name-style="western">
          <surname>Valderas</surname>
          <given-names>Jose Maria</given-names>
        </name>
        <degrees>PhD</degrees>
        <xref rid="aff4" ref-type="aff">4</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-9299-1555</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib4" equal-contrib="yes">
        <name name-style="western">
          <surname>Campbell</surname>
          <given-names>John</given-names>
        </name>
        <degrees>MD</degrees>
        <xref rid="aff4" ref-type="aff">4</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-6752-3493</ext-link>
      </contrib>
    </contrib-group>
    <aff id="aff1">
    <sup>1</sup>
    <institution>Centre for Health Services Research</institution>
    <institution>University of Cambridge</institution>  
    <addr-line>Cambridge</addr-line>
    <country>United Kingdom</country></aff>
    <aff id="aff2">
    <sup>2</sup>
    <institution>The Psychometrics Centre</institution>
    <institution>University of Cambridge</institution>  
    <addr-line>Cambridge</addr-line>
    <country>United Kingdom</country></aff>
    <aff id="aff3">
    <sup>3</sup>
    <institution>Leeds Institute for Health Sciences</institution>
    <institution>University of Leeds</institution>  
    <addr-line>Leeds</addr-line>
    <country>United Kingdom</country></aff>
    <aff id="aff4">
    <sup>4</sup>
    <institution>Primary Care Research Group</institution>
    <institution>University of Exeter</institution>  
    <addr-line>Exeter</addr-line>
    <country>United Kingdom</country></aff>
    <author-notes>
      <corresp>Corresponding Author: Chris Gibbons 
      <email>cg598@cam.ac.uk</email></corresp>
    </author-notes>
    <pub-date pub-type="collection"><month>03</month><year>2017</year></pub-date>
    <pub-date pub-type="epub">
      <day>15</day>
      <month>03</month>
      <year>2017</year>
    </pub-date>
    <volume>19</volume>
    <issue>3</issue>
    <elocation-id>e65</elocation-id>
    <!--history from ojs - api-xml-->
    <history>
      <date date-type="received">
        <day>23</day>
        <month>8</month>
        <year>2016</year>
      </date>
      <date date-type="rev-request">
        <day>12</day>
        <month>9</month>
        <year>2016</year>
      </date>
      <date date-type="rev-recd">
        <day>30</day>
        <month>9</month>
        <year>2016</year>
      </date>
      <date date-type="accepted">
        <day>29</day>
        <month>11</month>
        <year>2016</year>
      </date>
    </history>
    <!--(c) the authors - correct author names and publication date here if necessary. Date in form ', dd.mm.yyyy' after jmir.org-->
    <copyright-statement>©Chris Gibbons, Suzanne Richards, Jose Maria Valderas, John Campbell. Originally published in the Journal of Medical Internet Research (http://www.jmir.org), 15.03.2017.</copyright-statement>
    <copyright-year>2017</copyright-year>
    <license license-type="open-access" xlink:href="http://creativecommons.org/licenses/by/2.0/">
      <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (http://creativecommons.org/licenses/by/2.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on http://www.jmir.org/, as well as this copyright and license information must be included.</p>
    </license>  
    <self-uri xlink:href="http://www.jmir.org/2017/3/e65/" xlink:type="simple"/>
    <abstract>
      <sec sec-type="background">
        <title>Background</title>
        <p>Machine learning techniques may be an effective and efficient way to classify open-text reports on doctors’ activity for the purposes of quality assurance, safety, and continuing professional development.</p>
      </sec>
      <sec sec-type="objective">
        <title>Objective</title>
        <p>The objective of the study was to evaluate the accuracy of machine learning algorithms trained to classify open-text reports of doctor performance and to assess the potential for classifications to identify significant differences in doctors’ professional performance in the United Kingdom.</p>
      </sec>
      <sec sec-type="methods">
        <title>Methods</title>
        <p>We used 1636 open-text comments (34,283 words) relating to the performance of 548 doctors collected from a survey of clinicians’ colleagues using the General Medical Council Colleague Questionnaire (GMC-CQ). We coded 77.75% (1272/1636) of the comments into 5 global themes (innovation, interpersonal skills, popularity, professionalism, and respect) using a qualitative framework. We trained 8 machine learning algorithms to classify comments and assessed their performance using several training samples. We evaluated doctor performance using the GMC-CQ and compared scores between doctors with different classifications using <italic>t</italic> tests.</p>
      </sec>
      <sec sec-type="results">
        <title>Results</title>
        <p>Individual algorithm performance was high (range <italic>F</italic> score=.68 to .83). Interrater agreement between the algorithms and the human coder was highest for codes relating to “popular” (recall=.97), “innovator” (recall=.98), and “respected” (recall=.87) codes and was lower for the “interpersonal” (recall=.80) and “professional” (recall=.82) codes. A 10-fold cross-validation demonstrated similar performance in each analysis. When combined together into an ensemble of multiple algorithms, mean human-computer interrater agreement was .88. Comments that were classified as “respected,” “professional,” and “interpersonal” related to higher doctor scores on the GMC-CQ compared with comments that were not classified (<italic>P</italic>&#60;.05). Scores did not vary between doctors who were rated as popular or innovative and those who were not rated at all (<italic>P</italic>&#62;.05).</p>
      </sec>
      <sec sec-type="conclusions">
        <title>Conclusions</title>
        <p>Machine learning algorithms can classify open-text feedback of doctor performance into multiple themes derived by human raters with high performance. Colleague open-text comments that signal respect, professionalism, and being interpersonal may be key indicators of doctors’ performance.</p>
      </sec>
    </abstract>
    <kwd-group>
      <kwd>machine learning</kwd>
      <kwd>surveys and questionnaires</kwd>
      <kwd>feedback</kwd>
      <kwd>data mining</kwd>
      <kwd>work performance</kwd>
    </kwd-group></article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Multisource “360-degree” feedback is increasingly used across business and health sectors to give workers insights into their performance and to identify areas in which improvements may be made. Such feedback often includes different reporting modalities that most commonly take the form of validated questionnaires or open-text comments. In the United Kingdom, large-scale national surveys include open-text feedback, such as the Friends and Family Test, the Inpatient Survey, and the Cancer Patient Experience Survey.</p>
      <p>The complexity of open-text information means that, unlike the scores from validated patient-reported experiences and outcome measures, the words cannot simply be “added up” to create insight and meaning. As such, the task of making sense of such data has historically been completed manually by skilled qualitative analysts.</p>
      <p>As the volume of text increases, qualitative data can quickly become difficult to manage and draw insights from. Coding and interpreting large bodies of qualitative information received from open-text comments is labor-intensive and is at risk of bias if multiple raters use subtly different coding heuristics. Where human raters systematically analyze qualitative data, there remain issues with both time and financial constraints of doing so, as well as potential challenges in ensuring intercoder consistency [<xref ref-type="bibr" rid="ref1">1</xref>].</p>
      <p>The term <italic>machine learning</italic> refers to the application of a growing number of algorithms that are able to complete diverse computational tasks, including mastering complex computer games [<xref ref-type="bibr" rid="ref2">2</xref>], understanding the meaning of sentences [<xref ref-type="bibr" rid="ref3">3</xref>], and successfully predicting psychological profiles from Internet behavior [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>].</p>
      <p>Although machine learning appears to be eminently suitable for the task of classifying open-text data from national surveys, its potential is largely untested in the context of comments made by medical professionals about doctors’ performance. Classification algorithms have been previously applied to patient comments about the experience of living beyond cancer [<xref ref-type="bibr" rid="ref6">6</xref>], clinical incident reports [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>], and sentiment analysis of digital footprints including Twitter and online blogs [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>].</p>
      <p>While algorithms have demonstrated excellent performance in diverse tasks, there is no evidence specifically relating to their ability to classify comments about doctors made by their colleagues as part of a formal evaluation. Although doctors’ performance might be best assessed by fellow professionals who know them very well, positive reporting bias in open-text reports may occlude differences in performance [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. The challenge therefore is to classify differences in text that is often positively worded and to use these classifications to signal differences in doctors’ performance.</p>
      <p>The objective of this study was to train and evaluate an ensemble of machine learning algorithms to accurately classify open-text reports of doctors, which are known to be positively biased, and to assess the potential for theory-based classifications in open text to signal differences in doctors’ professional performance in the United Kingdom.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Sample</title>
        <p>We collected data from all non–training-grade doctors from 11 sites in England and Wales between March 2008 and January 2011. We recruited doctors from 4 acute hospital trusts, an anesthetics department, 1 mental health trust, 4 primary care organizations, and 1 independent (non–National Health Service) health care organization. We provided all doctors with detailed information regarding the study before they consented to take part in it; they were told they could withdraw at any point without justification. A detailed description of this sample is reported elsewhere [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>].</p>
        <p>Doctors were asked to suggest up to 20 colleagues (half of whom were to be medically qualified) who could provide multisource “360-degree” feedback regarding their professional performance.</p>
        <p>Multisource feedback was elicited using the General Medical Council Colleague Questionnaire (GMC-CQ), a reliable measure of doctor performance that is validated for use in the United Kingdom [<xref ref-type="bibr" rid="ref13">13</xref>]. The GMC-CQ contains 18 items assessing diverse aspects of doctor performance and a section for entering open-text feedback.</p>
      </sec>
      <sec>
        <title>Text Categorization</title>
        <p>Qualitative analysts inductively coded the open-text feedback from the GMC-CQ into 5 themes relating to (1) innovation and openness to change (59/1636 comments, 3.6%); (2) interpersonal skills and caring (432/1636 comments, 26.4%); (3) popularity (131/1636 comments, 8%); (4) professionalism (701/1636 comments, 42.8%); and (5) respect or esteem in which the doctor was held (346/1636 comments, 21.1%) [<xref ref-type="bibr" rid="ref12">12</xref>]. We refer to these categories throughout the rest of the paper as innovator, interpersonal skills, popularity, professionalism, and respect. Classification of a comment into more than one theme was possible. Of the 1636 reports, 1211 (74%) were classified as belonging to at least one of these categories. Similarly, classification of doctors into more than one category was possible, and 648 (28.8%) reports were classified into one or more of the 5 categories; as such, there were 2858 human-labeled comments in the entire corpus.</p>
        <p>The number of comments in each category, the distribution of words, and statistical comparison of the word length are provided in <xref ref-type="table" rid="table1">Table 1</xref>. Significant analysis of variance (ANOVA; with post hoc Tukey test) results indicate that the number of words in texts that were granted the label of “innovator” was significantly greater than all other categories, whereas comments that received a label of “respected” or no label at all were significantly shorter.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Number of comments, distribution of words, and statistical comparison for each of the 5 categories.</p>
          </caption>
          <table width="499" cellpadding="7" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="152"/>
            <col width="101"/>
            <col width="109"/>
            <col width="80"/>
            <thead>
              <tr valign="top">
                <td>Categories</td>
                <td>Reports in category</td>
                <td>Length of report, mean (SD)</td>
                <td>ANOVA<sup>a</sup><italic>P</italic> value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Innovator</td>
                <td>59</td>
                <td>41.99 (30.84)</td>
                <td>&#60;.001</td>
              </tr>
              <tr valign="top">
                <td>Interpersonal</td>
                <td>432</td>
                <td>23.87 (16.39)</td>
                <td>.99</td>
              </tr>
              <tr valign="top">
                <td>Popular</td>
                <td>131</td>
                <td>25.49 (16.74)</td>
                <td>.97</td>
              </tr>
              <tr valign="top">
                <td>Professional</td>
                <td>701</td>
                <td>24.46 (17.34)</td>
                <td>.91</td>
              </tr>
              <tr valign="top">
                <td>Respected</td>
                <td>346</td>
                <td>20.69 (19.13)</td>
                <td>.03</td>
              </tr>
              <tr valign="top">
                <td>More than 1 category</td>
                <td>1189</td>
                <td>21.63 (16.76)</td>
                <td>.56</td>
              </tr>
              <tr valign="top">
                <td>No categories</td>
                <td>425</td>
                <td>19.54 (13.62)</td>
                <td>&#60;.001</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>ANOVA: analysis of variance; conducted with post hoc Tukey tests.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>The qualitative researchers followed Holsti’s approach [<xref ref-type="bibr" rid="ref14">14</xref>]. Using rigorous data coding and verification procedures, which included double coding and independent verification within a qualitative framework [<xref ref-type="bibr" rid="ref12">12</xref>], the resultant data were coded in such a way as to support quantitative data analysis.</p>
        <p>Comments were generally, though not always, positive. In our sample, 91.5% (1497/1636) of all comments were positive, 5.93% (97/1636) of the comments were mixed, containing both a positive and a negative statement about the doctor, and the remaining 2.57% (42/1636) of comments were either neutral or negative (see <xref ref-type="table" rid="table2">Table 2</xref>). Prior publications relating to this dataset give further information on the process of ascertaining the polarity of comments [<xref ref-type="bibr" rid="ref12">12</xref>].</p>
        </sec>
<sec>
        <title>Assessment of Machine Learning Algorithm Performance</title>
        <p>The process of training, validating, and deploying the algorithms is illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p>


<table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Example quotes from each category.</p>
          </caption>
          <table width="688" cellpadding="7" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="124"/>
            <col width="533"/>
            <thead>
              <tr valign="top">
                <td>Theme</td>
                <td>Comment</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Innovator</td>
                <td>“It is clear from the advice he gives that he is aware of [the] current good practice, is highly motivated, very practical and very much a team player. His advice, when working with consultant colleagues was respected, and he recognized where practice/primary care limitations were and yet looked for opportunities for change and improvement.”</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>“She has an admirable level of commitment and enthusiasm for her patients and her work. She has been instrumental in promoting change and improvement in her department. She is a great asset to the department and the hospital.”</td>
              </tr>
              <tr valign="top">
                <td>Interpersonal</td>
                <td>“She is a very good, committed colleague always keen to improve, very liked by her patients and highly valued by all who work with her.”</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>“Very approachable and professional.”</td>
              </tr>
              <tr valign="top">
                <td>Popular</td>
                <td>“Excellent well liked and easy working colleague.”</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>“Very popular doctor. Works to high standards.”</td>
              </tr>
              <tr valign="top">
                <td>Professional</td>
                <td>“I find this doctor to be very efficient, caring, honest and very professional.”</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>“I find that he very easy and helpful to work with, he always has time for patients and staff.”</td>
              </tr>
              <tr valign="top">
                <td>Respected</td>
                <td>“A first class colleague.”</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>“Pleasant and valued colleague.”</td>
              </tr>
              <tr valign="top">
                <td>Not coded by qualitative rater (given label of 0)</td>
                <td>“Supportive colleague, excellent time management skills.” <break/><break/></td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>“I think I have a good working relationship with this doctor. I have been impressed with his openness to Psychological work with his patients and his support for my work. In my opinion he gives thorough consideration to his diagnosis.”</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Flow diagram of the stages “training,” “validation,” and “application to new data.”</p>
          </caption>
          <graphic xlink:href="jmir_v19i3e65_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Feature Selection</title>
        <p>The first step within each stage is the identification of features within the comments. The features used in this study were identified and stored using a term-document matrix that describes the frequency of terms that appear in each of the comments. The term-document matrix uses a bag-of-words structure that counts the number of terms in each comment and does not consider the order in which the words appear. Term-document matrices are a simple way to represent text data that are computationally straightforward. The matrix comprised unweighted words and was cleaned by stemming, removing numbers, and removing sparse terms (where a certain word was only used in fewer than 0.02% of cases) [<xref ref-type="bibr" rid="ref15">15</xref>]. Sparse-term removal reduced the number of terms from 1737 to 616. The final term-document matrix contained a lexicon of 616 unique words (columns) for 1636 comments (rows). The matrix density was 5.8%.</p>
        <p>The term frequencies for each comment were therefore used as features that the algorithms used to classify the text. An example of a term-document matrix is provided in <xref ref-type="table" rid="table3">Table 3</xref>. For each of the 5 categories, texts with a human classification in that category were labeled with a 1 and those without were labeled with a 0.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>An example term-document matrix for 3 texts.</p>
          </caption>
          <table width="716" cellpadding="7" cellspacing="0" border="1" rules="groups" frame="hsides">
            <colgroup>
              <col width="39"/>
            </colgroup>
            <colgroup>
              <col width="14"/>
              <col width="24"/>
              <col width="62"/>
              <col width="43"/>
              <col width="33"/>
              <col width="24"/>
              <col width="52"/>
              <col width="62"/>
              <col width="33"/>
              <col width="80"/>
              <col width="33"/>
              <col width="32"/>
            </colgroup>
            <thead>
              <tr valign="top">
                <td>Texts</td>
                <td colspan="12">Terms</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>a</td>
                <td>and</td>
                <td>colleague</td>
                <td>doctor</td>
                <td>great</td>
                <td>is</td>
                <td>patients</td>
                <td>respected</td>
                <td>this</td>
                <td>troublesome</td>
                <td>well</td>
                <td>with</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Text 1<sup>a</sup></td>
                <td>1</td>
                <td>0</td>
                <td>1</td>
                <td>0</td>
                <td>1</td>
                <td>0</td>
                <td>0</td>
                <td>0</td>
                <td>0</td>
                <td>0</td>
                <td>0</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td>Text 2<sup>b</sup></td>
                <td>1</td>
                <td>0</td>
                <td>1</td>
                <td>0</td>
                <td>0</td>
                <td>0</td>
                <td>0</td>
                <td>0</td>
                <td>0</td>
                <td>1</td>
                <td>0</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td>Text 3<sup>c</sup></td>
                <td>0</td>
                <td>1</td>
                <td>0</td>
                <td>1</td>
                <td>1</td>
                <td>1</td>
                <td>1</td>
                <td>1</td>
                <td>1</td>
                <td>0</td>
                <td>1</td>
                <td>1</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>Text 1: “A great colleague.”</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>Text 2: “A troublesome colleague.”</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>Text 3: “This doctor is well respected, and great with patients.”</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>Once the features have been extracted, they are used to “train” the algorithms to describe the relationship between the features and the classification.</p>
        <p>In the validation stage, the classifications given by the ensemble to new data are compared with the classifications made by the human qualitative analysts. If the results of the validation stage are acceptable, the algorithms can be exported and used to classify new data independently of the dataset that was used to train and validate the models.</p>
        <p>The steps of these stages are given in greater detail in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p>
      </sec>
      <sec>
        <title>Algorithms</title>
        <p>“RTextTools” brings together other packages that contain different machine learning algorithms and provides a system by which the performance of each algorithm can be assessed both individually and as a collected ensemble of different methods that are combined to maximize performance in the training dataset. We included all of the available algorithms within RTextTools apart from the neural network, which did not converge in pilot assessments. The algorithms were support vector machine (SVM) using the radial basis function kernel with the penalty parameter of error term set to 1 and a gamma parameter set to 1/number of features [<xref ref-type="bibr" rid="ref16">16</xref>], scaled linear discriminant analysis (SLDA) with eigenvalue threshold set to ≥1, bootstrap aggregating (bagging) with 25 bootstrap replications [<xref ref-type="bibr" rid="ref17">17</xref>], boosting [<xref ref-type="bibr" rid="ref18">18</xref>], random classification and regression forests with 500 trees [<xref ref-type="bibr" rid="ref19">19</xref>], classification and regression tree [<xref ref-type="bibr" rid="ref20">20</xref>], maximum entropy without regularization [<xref ref-type="bibr" rid="ref21">21</xref>], and generalized linear models with L1 (lasso) penalized regularization (GLM/LASSO) [<xref ref-type="bibr" rid="ref22">22</xref>].</p>
        <p>A review and summary of supervised machine learning algorithms can be found elsewhere [<xref ref-type="bibr" rid="ref23">23</xref>].</p>
      </sec>
      <sec>
        <title>Training and Validation</title>
        <p>Algorithms were trained with a corpus of 1000 randomly selected precoded comments (see <xref ref-type="fig" rid="figure1">Figure 1</xref>, part “a”) and validated on the remaining 636 comments (see <xref ref-type="fig" rid="figure1">Figure 1</xref>, part “b”).</p>
        <p>We assessed algorithm performance using statistics of (1) recall (analogous to sensitivity)—what proportion of cases in a class are correctly assigned to the class; (2) precision (analogous to specificity)—how often a case that is predicted to belong to a class does belong to that class; and (3) <italic>F</italic> score, which is a combination of both recall and precision where 1 represents the best performance and 0 the worst performance [<xref ref-type="bibr" rid="ref24">24</xref>]. To maximize performance, algorithms are combined into a consensus “ensemble” consisting of multiple algorithms. The consensus ensemble is a collection of algorithms that make the same prediction concerning the class of a text in the training dataset. We included the group of algorithms that had full agreement on every document in the dataset in the training sample. Classification was performed using majority voting between the algorithms in the ensemble.</p>
        <p>When assessed as an ensemble of multiple algorithms working together, recall is evaluated alongside coverage (the proportion of cases within the dataset to which the recall value applies) [<xref ref-type="bibr" rid="ref21">21</xref>]. The <italic>F</italic> value is analogous to interrater reliability and, as such, we will accept agreements ≥.80 between the algorithms and the human codes as evidence that the algorithms can complete the categorization task with acceptable accuracy.</p>
      </sec>
      <sec>
        <title>n-Fold Validation</title>
        <p>In addition to the standard assessment of algorithm performance using a validation dataset (636 comments), stability of algorithm performance across different data was also tested using an n-fold cross-validation. In the current analysis, a 10-fold validation was used in which 10 randomly selected samples of 1000 comments were selected from the dataset and validated using the remaining documents.</p>
        <p>This analysis will indicate the robustness of the algorithms and their suitability for application to novel data and is preferable to split-half validation or bootstrapping [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref25">25</xref>].</p>
      </sec>
      <sec>
        <title>Sample Size Accuracy Trade-Off</title>
        <p>As well as the precision of the algorithms it is important to assess their training efficiency (the relationship between performance and size of the training dataset) so that we might best understand how to apply these techniques in practice. We compared training efficiency performance using randomly selected training sets of 1000, 750, 500, 250, 100, and 50 to accurately classify randomly selected comments from a fixed-size validation set (636 cases).</p>
      </sec>
      <sec>
        <title>Assessment of Group Differences</title>
        <p>To assess the ability of these categories to highlight differences in global performance, we investigated the differences in GMC-CQ scores for doctors whose comments were classified into at least 1 of 5 categories and those who were not placed in any category. We hypothesized that doctors who were placed into one or more categories would perform better than those doctors who were not classified into any of the categories.</p>
        <p>We conducted this analysis using both the machine-coded dataset (the entire dataset was recoded by the algorithm blinded to the codes given by the human rater) and the original human-rated dataset.</p>
        <p>Questionnaire data were scored using the graded response model [<xref ref-type="bibr" rid="ref22">22</xref>]. All items fit the graded response model (chi-square interaction <italic>P</italic>&#62;.01) and overall model fit was good (root mean square error of approximation =.048, comparative fit index =.97) [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. The scale’s marginal reliability was .76. This analysis was conducted so that interval-scaled logit scores (theta) could be extracted from the model to use in the comparative analysis. This technique has been shown to increase sensitivity to detect change on questionnaire measures of quality of life [<xref ref-type="bibr" rid="ref28">28</xref>]. Correlation between theta values and scale raw scores was .95. Further details on the process of item response theory scoring and analysis can be found elsewhere [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>].</p>
      </sec>
      <sec>
        <title>Statistical Analysis</title>
        <p>Computational text classification and statistical analysis were conducted within the R statistical programming environment (R Foundation) [<xref ref-type="bibr" rid="ref31">31</xref>] using the “RTextTools” package for training the algorithms and the “base” package for conducting between-group comparisons. Figures were plotted using “ggplot2.”</p>
      </sec>
      <sec>
        <title>Ethical Approval</title>
        <p>The study was originally considered by the Devon and Torbay NHS Research Ethics Committee but judged not to require a formal ethics submission. No subsequent ethical approval was sought for the secondary analyses on the anonymized datasets presented here.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Assessment of Algorithm Performance</title>
        <p><xref ref-type="table" rid="table4">Table 4</xref> presents the summary performance statistics for the algorithms and their individual <italic>F</italic> scores and the recall values for the ensemble of algorithms.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Summary of algorithm and ensemble performance in the main analysis.</p>
          </caption>
          <table width="600" cellpadding="7" cellspacing="0" border="1" rules="groups" frame="hsides">
    
              <col width="80"/>
              <col width="26"/>
              <col width="28"/>
              <col width="28"/>
              <col width="28"/>
              <col width="28"/>
              <col width="28"/>
              <col width="21"/>
          
            <thead>
              <tr valign="top">
                <td>Model<sup>a</sup></td>
                <td>Metric</td>
                <td>Innovator</td>
                <td>Interpersonal</td>
                <td>Popular</td>
                <td>Professional</td>
                <td>Respected</td>
                <td>Average</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Support vector machine</td>
                <td><italic>F</italic> score</td>
                <td>.73</td>
                <td>.69</td>
                <td>.84</td>
                <td>.79</td>
                <td>.73</td>
                <td>.76</td>
              </tr>
              <tr valign="top">
                <td>Scaled linear discriminant analysis</td>
                <td><italic>F</italic> score</td>
                <td>.77</td>
                <td>.65</td>
                <td>.88</td>
                <td>.73</td>
                <td>.77</td>
                <td>.76</td>
              </tr>
              <tr valign="top">
                <td>Boosting</td>
                <td><italic>F</italic> score</td>
                <td>.75</td>
                <td>.77</td>
                <td>.81</td>
                <td>.76</td>
                <td>.75</td>
                <td>.77</td>
              </tr>
              <tr valign="top">
                <td>Bootstrap boosting</td>
                <td><italic>F</italic> score</td>
                <td>.87</td>
                <td>.85</td>
                <td>.83</td>
                <td>.80</td>
                <td>.82</td>
                <td>.83</td>
              </tr>
              <tr valign="top">
                <td>Random forests</td>
                <td><italic>F</italic> score</td>
                <td>.67</td>
                <td>.59</td>
                <td>.87</td>
                <td>.78</td>
                <td>.74</td>
                <td>.75</td>
              </tr>
              <tr valign="top">
                <td>Decision tree</td>
                <td><italic>F</italic> score</td>
                <td>.80</td>
                <td>.75</td>
                <td>.88</td>
                <td>.78</td>
                <td>.80</td>
                <td>.80</td>
              </tr>
              <tr valign="top">
                <td>Generalized linear model</td>
                <td><italic>F</italic> score</td>
                <td>.89</td>
                <td>.82</td>
                <td>.88</td>
                <td>.81</td>
                <td>.89</td>
                <td>.85</td>
              </tr>
              <tr valign="top">
                <td>Maximum entropy</td>
                <td><italic>F</italic> score</td>
                <td>.70</td>
                <td>.62</td>
                <td>.73</td>
                <td>.65</td>
                <td>.70</td>
                <td>.68</td>
              </tr>
              <tr valign="top">
                <td>Final ensemble (3+ models with<break/>agreement for the entire dataset)</td>
                <td>Recall<break/>with 100% agreement</td>
                <td>.98</td>
                <td>.80</td>
                <td>.97</td>
                <td>.82</td>
                <td>.87</td>
                <td>.89</td>
              </tr>
              <tr valign="top">
                <td>10-Fold validation mean (range)</td>
                <td><italic>F</italic> score</td>
                <td>.97 (.96-.98)</td>
                <td>.80 (.74-.86)</td>
                <td>.97 (.96-.98)</td>
                <td>.79 (.75-.83)</td>
                <td>.86 (.84-.89)</td>
                <td>.88</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>Training set size=1000; validation=636.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <sec>
          <title>Innovator</title>
          <p>The GLM/LASSO algorithm was the single highest-performing algorithm for correctly classifying the open-text comments into the “innovator” category. The ensemble of 3 algorithms (GLM/LASSO, bootstrapped boosting, and regression tree) has a 98% recall agreement with the human coder. The 10-fold validation indicated robust accuracy scores between .96 and .98 (mean .97). In the whole dataset, 48 comments (3.5%) were classified as an innovator by the algorithms.</p>
        </sec>
        <sec>
          <title>Interpersonal</title>
          <p>The bootstrapped boosting algorithm was the best-performing algorithm for categorizing open-text comments into the “interpersonal skills” category. The ensemble of 3 algorithms (boosting, GLM/LASSO, and bootstrapped boosting) demonstrated an 80% recall agreement with the human-coded dataset, the lowest performance for any of the classes. The 10-fold validation indicated similar performance in each fold and agreement values between .74 and .86 (mean .80). The algorithms classified 435 comments (28.4%) as “interpersonal.”</p>
        </sec>
        <sec>
          <title>Popular</title>
          <p>All algorithms performed exceptionally well in classifying open-text comments into the “popular” category, with <italic>F</italic> scores greater than .80 for all but maximum entropy (<italic>F</italic> score=.73). The ensemble performance (SLDA, decision tree, and GLM/LASSO) was also excellent, with an interrater recall agreement of .97 (10-fold validation range .96-.98, mean .97). In total, 107 comments (8.3%) were placed in the “popular” category.</p>
        </sec>
        <sec>
          <title>Professional</title>
          <p>Similar performance was evident for many algorithms including SVM, random forests, and GLM/LASSO. Overall ensemble performance (GLM/LASSO, bootstrap boosting, and SVM) had an interrater recall of .82. The 10-fold validation suggested good agreement between the algorithm and the human analyst (mean .79, range .75-.83). The algorithms classified almost half of the comments in the whole dataset into the “professional” category (643 comments, 42.7%).</p>
        </sec>
        <sec>
          <title>Respected</title>
          <p>Once again, the GLM/LASSO algorithm showed the strongest single performance in the classification task for the “respected” category (<italic>F</italic> score=.89). The overall performance of the ensemble was very high, with .87 recall between the human coder and the 3-algorithm ensemble. The 10-fold validation demonstrated greater agreement between the human analyst and the algorithms (mean .86, range .84-.89). In the whole dataset, the ensemble classified 243 (16.6%) comments into the “respected” category.</p>
        </sec>
        <sec>
          <title>Overall Performance</title>
          <p>The GLM/LASSO algorithm was the strongest performing individual algorithm and the maximum entropy the worst. The overall average performance was nevertheless high (<italic>F</italic> score=.77). Average agreement between the human coder and the ensemble of algorithms was high (.89).</p>
        </sec>
      </sec>
      <sec>
        <title>n-Fold Validation of Ensemble Accuracy</title>
        <p>Results for the 10-fold cross-validation were very similar for the final recall values for the individual samples. The n-fold result displayed a tight distribution over the 10 samples (<xref ref-type="table" rid="table4">Table 4</xref>), indicating that the ensemble performs robustly across different samples.</p>
      </sec>
      <sec>
        <title>Algorithm Performance With Differing Sample Sizes</title>
        <p>As the training sample size was reduced, the algorithms continued to perform well, but performance fell sharply when the training dataset was reduced to fewer than 250 comments. <xref ref-type="fig" rid="figure2">Figure 2</xref> shows the algorithm performance with different training sample sizes.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Algorithm performance with differing training sample sizes. Performance decreases as expected with smaller training corpora.</p>
          </caption>
          <graphic xlink:href="jmir_v19i3e65_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Assessment of Group Differences</title>
        <p>The <italic>t</italic> tests demonstrated a significant difference in the GMC-CQ scores between doctors who received comments that placed them into 1 of the 5 categories and those who did not (<italic>t</italic><sub>173.81</sub>=0.77, <italic>P</italic>=.001). Although the results were significant, there was sizeable overlap in the distributions as shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>, indicating that open-text classification alone was not sufficient to reliably distinguish between doctors’ performance. The largest difference in mean performance was between doctors who were classified as “respected” and those without a classification (<italic>t</italic><sub>629.17</sub>=3.75, <italic>P</italic>&#60;.001). There was no difference in mean performance scores between doctors who were classified as “popular” (<italic>P</italic>=.44) and those who were not. Similarly, being rated as “innovative” did not signal higher performance (<italic>P</italic>=.99), although the low numbers in the analysis suggest a lack of power to detect an effect (n=48). <xref ref-type="table" rid="table5">Table 5</xref> presents the results of these analyses conducted with both the machine learning classifications (<xref ref-type="table" rid="table5">Table 5</xref>, “Panel A”) and the human classifications (<xref ref-type="table" rid="table5">Table 5</xref>, “Panel B”). The results are similar between the human-rated and machine-rated datasets, with stronger effect sizes being reported in the human-classified group.</p>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Comparison of means between doctors classified into a category and those who were unclassified.</p>
          </caption>
          <table width="600" cellpadding="7" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="75"/>
              <col width="25"/>
              <col width="40"/>           
              <col width="25"/>
              <col width="25"/>
              <col width="25"/>
              <col width="10"/>
              <col width="25"/>
              <col width="40"/>
              <col width="25"/>
              <col width="25"/>
              <col width="25"/>
            <thead>
              <tr valign="top">
                <td>Categories</td>
                <td colspan="5">Panel A<sup>a</sup></td>
                <td><break/></td>
             <td colspan="5">Panel B<sup>b</sup></td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>Mean score<break/>(logits)</td>
                <td>Reports in<break/>category</td>
                <td colspan="3"><italic>t</italic> test<break/>(vs no category rating)</td>
                <td><break/></td>
                <td>Mean score<break/>(logits)</td>
                <td>Reports in<break/>category</td>
                <td colspan="3"><italic>t</italic> test<break/>(vs no category rating)</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td><break/></td>
                <td><break/></td>
                <td><italic>t</italic></td>
                <td>df<sup>c</sup></td>
                <td><italic>P</italic></td>
                <td><break/></td>
                <td><break/></td>
                <td><break/></td>
<td><italic>t</italic></td>
                <td>df</td>
                <td><italic>P</italic></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Innovator</td>
                <td>0.00</td>
                <td>48</td>
                <td>0.00</td>
                <td>55.74</td>
                <td>.99</td>
 <td><break/></td>
                <td>0.01</td>
                <td>59</td>
                <td>1.14</td>
                <td>35.69</td>
                <td>.26</td>
              </tr>
              <tr valign="top">
                <td>Interpersonal</td>
                <td>1.97</td>
                <td>435</td>
                <td>1.98</td>
                <td>857.97</td>
                <td>.04</td>
                <td><break/></td>
<td>0.07</td>
                <td>432</td>
                <td>2.97</td>
                <td>346.63</td>
                <td>&#60;.01</td>
              </tr>
              <tr valign="top">
                <td>Popular</td>
                <td>−0.05</td>
                <td>107</td>
                <td>−0.88</td>
                <td>176.42</td>
                <td>.38</td>
                <td><break/></td>
<td>0.13</td>
                <td>131</td>
                <td>1.32</td>
                <td>149.05</td>
                <td>.19</td>
              </tr>
              <tr valign="top">
                <td>Professional</td>
                <td>−0.03</td>
                <td>643</td>
                <td>2.51</td>
                <td>901.34</td>
                <td>.01</td>
<td><break/></td>
                <td>0.1</td>
                <td>701</td>
                <td>3.47</td>
                <td>286.99</td>
                <td>&#60;.001</td>
              </tr>
              <tr valign="top">
                <td>Respected</td>
                <td>0.15</td>
                <td>243</td>
                <td>3.75</td>
                <td>629.17</td>
                <td>&#60;.001</td>
<td><break/></td>
                <td>0.44</td>
                <td>346</td>
                <td>5.58</td>
                <td>300.13</td>
                <td>&#60;.001</td>
              </tr>
              <tr valign="top">
                <td>More than 1 category</td>
                <td>0.04</td>
                <td>1081</td>
                <td>0.77</td>
                <td>173.81</td>
                <td>.001</td>
                <td><break/></td>
<td>0.12</td>
                <td>1189</td>
                <td>3.81</td>
                <td>239.8</td>
                <td>&#60;.001</td>
              </tr>
              <tr valign="top">
                <td>No categories</td>
                <td>−0.09</td>
                <td>413</td>
                <td>N/A<sup>d</sup></td>
                <td>N/A</td>
                <td>N/A</td>
                <td><break/></td>
<td>−0.4</td>
                <td>425</td>
                <td>N/A</td>
                <td>N/A</td>
                <td>N/A</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table5fn1">
              <p><sup>a</sup>Panel A: analysis using machine ensemble classifications on entire corpus.</p>
            </fn>
            <fn id="table5fn2">
              <p><sup>b</sup>Panel B: analysis using human rater classifications on entire corpus.</p>
            </fn>
            <fn id="table5fn3">
              <p><sup>c</sup>df: degrees of freedom.</p>
            </fn>
            <fn id="table5fn4">
              <p><sup>d</sup>N/A: not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Comparison of General Medical Council Colleague Questionnaire (GMC-CQ) scores between doctors who were placed in 1 of the 5 categories versus those who were not (positive comments only). Significance (P) values for the t tests are shown to indicate the relationship between the 2 groups.</p>
          </caption>
          <graphic xlink:href="jmir_v19i3e65_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>This study demonstrates the ability of machine learning algorithms to categorize qualitative data with high performance. The integration of such algorithms into data analysis tool kits for nationwide surveys may allow rich qualitative data to be analyzed without the resource burden associated with expert human ratings across an entire corpus [<xref ref-type="bibr" rid="ref32">32</xref>].</p>
        <p>We also demonstrate the ability of categories to highlight differences in overall doctor performance that were statistically significant. We hypothesized that doctors who were classified into 1 of the 5 categories would have higher scores on the GMC-CQ than those who were unclassified. We found partial support for this hypothesis: doctors who were classified as “respected,” “professional,” and “interpersonal” tended to outperform unclassified doctors, whereas no significant difference in performance was evident between doctors who were classified as “popular,” “innovative,” and those who were not classified into any of the 5 categories. However, the number of doctors who received a classification of “innovative” was low, which resulted in tests with low power and potential for type II error. Doctors with multiple ratings performed better than those without any ratings. Doctors who were classified as “respected” had the highest performance of each group in both the human-rated and machine-rated datasets.</p>
        <p>These techniques have clear potential for developing actionable insights in diverse specialties: they have also been used to classify patient-derived open-text comments in national cancer surveys [<xref ref-type="bibr" rid="ref6">6</xref>]. A key advantage of these techniques is the possibility of deploying trained algorithms to operate on data as they are being collected, allowing real-time feedback and insight from open-text data [<xref ref-type="bibr" rid="ref33">33</xref>], which may be used to monitor performance, and possibly safety, in the future. It is important to remember that although machine learning algorithms can perform to a high level in prediction or classification tasks, some operate as a “black box” and it is often difficult, or even impossible, to generate theory or convey insight into <italic>how</italic> the algorithms arrived at their final solutions.</p>
        <p>We trained algorithms using precoded data and validated their performance on uncoded data, an example of “supervised” machine learning. We demonstrate strong performance using a relatively simple sparse “term-document matrix” method of identifying features in open text. The term-document matrix simply counts the instances of a word’s use within a comment and does not consider the order in which the words are presented. This approach has been used in similar studies published in the medical literature [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref8">8</xref>].</p>
        <p>It is possible to extract features using different, more complex, methods. Feature extraction using <italic>n</italic>-grams offers a means to retain some of the context in which words are used. An <italic>n</italic>-gram tokenizes sequences (of length <italic>n</italic>) of words as features, which may provide better information than the simple word-count strategy utilized in the term-document matrix. Similarly, dimension reduction or clustering techniques such as latent Dirichlet allocation or singular value decomposition could have been used to reduce the sparsity within the matrix. There are no simple guidelines suggesting the optimal matrix density for use in this context and the possible benefits of clustering and dimension reduction must be counterbalanced by the caveat that these techniques can reduce the interpretability and accuracy of predictions [<xref ref-type="bibr" rid="ref4">4</xref>].</p>
        <p>Although performance of these techniques is demonstrably high, further research may be warranted to explore the extent to which they can improve the accuracy of classification algorithms in this context and at what cost (eg, computational burden or interpretability).</p>
        <p>Natural language processing algorithms and their related software, driven by market forces in the technology industry, are improving at a remarkable rate and, paradoxically given the economic motivation for their development, increasingly being distributed at no cost under open-source licenses. As the field develops, we may expect these sorts of algorithms to successfully classify more complex corpora and perhaps even identify important elements within open-text comments without task-specific training data, which is known as “unsupervised” machine learning.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>This study has some limitations. The average performance of our algorithms is likely to be high in some instances given the low incidence of some categories. For example, algorithm performance was exceptionally high (recall=.97) for the “innovator” category, where only 4.2% of doctors were rated as innovative. In this instance, the low number of classifications in the population would have meant that a “dumb guess” that simply rated each doctor as “not innovative” would have demonstrated a 95.8% agreement rate. However, while algorithm performance was somewhat lower for categories with balanced distributions (eg, 46% of doctors were rated as professional), it was still acceptably high (recall=.82, 10-fold accuracy=.87).</p>
        <p>Because of a somewhat small dataset, the trained ensemble was used to reclassify the whole corpus on which the algorithms were originally trained. It is likely that the performance of the algorithms will be higher when reclassifying the data they were originally trained on. The rationale for this decision was to maximize the number of classified categories and therefore maintain statistical power in the analyses. The results of these assessments were broadly the same as the results from the human-classified dataset, albeit the effect sizes were consistently smaller in the analyses using the machine-classified codes. This may be especially important as it appeared that the machine-labeled dataset was less sensitive to differences in doctors’ performance than the human-labeled dataset.</p>
        <p>A further possible limitation of the dataset was the necessity to have a small training-set to validation-set ratio (3:2) to keep a sufficient number of comments in the validation sample. While this advantaged the statistical analysis of difference in performance signaling for different categories, it may have hampered the performance of certain algorithms by not providing sufficient training data.</p>
        <p>The significant positive skew in doctors’ ratings and the scarcity of comments that were outright negative meant that we were unable to conduct a sentiment analysis. We expect this to be the case in a population where most subjects are performing well, and this is probably representative of most datasets that collect open-text information on doctor performance. The content of uncategorized comments reveals a trend of doctors saying <italic>something</italic> positive about their colleagues even if it did not relate to key elements of their medical practice (eg, “supportive colleague. Excellent time management skills”). The data used here were collected in relation to the high-stakes General Medical Council revalidation exam, which may introduce a barrier to honest reporting of negative aspects of a doctor’s practice. In addition to contextual factors, innovation surrounding the manner in which such comments are elicited, including less direct conversational techniques, may also reduce reporting biases.</p>
        <p>Although algorithm performance was generally high for each of the individual machine learning techniques, it is apparent that the generalized linear model with lasso regularization had the highest performance for each of the classes. The precise reason for this improved performance is somewhat opaque, but the lasso regularization technique is especially suitable for classification problems using sparse matrices [<xref ref-type="bibr" rid="ref34">34</xref>]. It is somewhat surprising that classification and regression trees outperformed the random forests; this may be attributed to the sparsity of the matrix and the low number of classifications made in some of the categories leading to high misclassification error in the random trees. Their performance may have been improved using a dimensional reduction technique such as singular value decomposition or latent Dirichlet allocation, which reduces sparsity in the matrix but which may also lead to a loss of information for other algorithms and uninterpretable results [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>].</p>
        <p>Similarly, it is not immediately clear as to why certain codes could be classified with greater accuracy than others. The differences in performance between classes may be explained by differences in the conceptual basis of each class; both humans and algorithms may find it easier to classify comments that reflect easily defined concepts such as being “popular” (the class for which algorithm performance was highest), rather than less well-defined concepts such as being “interpersonal” (the class for which algorithm performance was lowest) [<xref ref-type="bibr" rid="ref37">37</xref>].</p>
        <p>There may be an opportunity for similar techniques to be applied to patient experience data to build algorithms that can correctly classify and perhaps, using sentiment analysis, quantify open text in national-scale patient experience surveys and provide feedback that is more meaningful to both patients and practitioners. Computational analysis of open-text comments may be of greater usefulness when it is used to identify issues that were not previously envisaged.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This study demonstrates excellent performance for an ensemble of machine learning algorithms tasked to classify open-text comments of doctors’ performance. These algorithms perform well, even where limited time and resources are available to code training datasets. We demonstrate that machine identification of qualitatively derived, theory-based open-text classifications can signpost significant differences in a doctor’s performance, even when comments are exclusively positive. These findings may inform future predictive models of performance and support real-time evaluation to improve quality and safety.</p>
      </sec>
    </sec>
  </body>
  <back>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">ANOVA</term>
          <def>
            <p>analysis of variance</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">GMC-CQ</term>
          <def>
            <p>General Medical Council Colleague Questionnaire</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">NIHR</term>
          <def>
            <p>National Institute for Health Research</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">SLDA</term>
          <def>
            <p>scaled linear discriminant analysis</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">SVM</term>
          <def>
            <p>support vector machine</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>We thank Karen Alexander, the National Institute for Health Research (NIHR) Adaptive Tests for Long-Term Conditions (ATLanTiC) patient and public involvement partner, for providing critical insight and comments and for editing the manuscript. Data collection and qualitative coding were funded by the UK General Medical Council as an unrestricted research award. Support for the novel work presented in this paper was given by a postdoctoral fellowship award for CG (NIHR-PDF-2014-07-028).</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>JC has been an advisor to the UK General Medical Council and has received only direct costs associated with the presentation of this work.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
        <source>NHSsurveys</source>  
        <comment>Inpatient Survey 2007: Analysis of the patients’ free text comments 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.nhssurveys.org/Filestore/documents/Freetext_comments_Exec_Summary.pdf">http://www.nhssurveys.org/Filestore/documents/Freetext_comments_Exec_Summary.pdf</ext-link></comment> </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Mnih</surname>
            <given-names>V</given-names>
          </name>
          <name name-style="western">
            <surname>Kavukcuoglu</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Silver</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Rusu</surname>
            <given-names>AA</given-names>
          </name>
          <name name-style="western">
            <surname>Veness</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Bellemare</surname>
            <given-names>MG</given-names>
          </name>
          <name name-style="western">
            <surname>Graves</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Riedmiller</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Fidjeland</surname>
            <given-names>AK</given-names>
          </name>
          <name name-style="western">
            <surname>Ostrovski</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Petersen</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Beattie</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Sadik</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Antonoglou</surname>
            <given-names>I</given-names>
          </name>
          <name name-style="western">
            <surname>King</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Kumaran</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Wierstra</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Legg</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Hassabis</surname>
            <given-names>D</given-names>
          </name>
        </person-group>
        <article-title>Human-level control through deep reinforcement learning</article-title>
        <source>Nature</source>  
        <year>2015</year>  
        <month>02</month>  
        <day>26</day>  
        <volume>518</volume>  
        <issue>7540</issue>  
        <fpage>529</fpage>  
        <lpage>33</lpage>  
        <pub-id pub-id-type="doi">10.1038/nature14236</pub-id>
        <pub-id pub-id-type="medline">25719670</pub-id>
        <pub-id pub-id-type="pii">nature14236</pub-id></nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Cambria</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>White</surname>
            <given-names>B</given-names>
          </name>
        </person-group>
        <article-title>Jumping NLP curves: a review of natural language processing research [Review Article]</article-title>
        <source>IEEE Comput Intell Mag</source>  
        <year>2014</year>  
        <month>5</month>  
        <volume>9</volume>  
        <issue>2</issue>  
        <fpage>48</fpage>  
        <lpage>57</lpage>  
        <pub-id pub-id-type="doi">10.1109/MCI.2014.2307227</pub-id></nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Kosinski</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Stillwell</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Graepel</surname>
            <given-names>T</given-names>
          </name>
        </person-group>
        <article-title>Private traits and attributes are predictable from digital records of human behavior</article-title>
        <source>Proc Natl Acad Sci USA</source>  
        <year>2013</year>  
        <month>04</month>  
        <day>09</day>  
        <volume>110</volume>  
        <issue>15</issue>  
        <fpage>5802</fpage>  
        <lpage>5</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.pnas.org/cgi/pmidlookup?view=long&#38;pmid=23479631"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1073/pnas.1218772110</pub-id>
        <pub-id pub-id-type="medline">23479631</pub-id>
        <pub-id pub-id-type="pii">1218772110</pub-id>
        <pub-id pub-id-type="pmcid">PMC3625324</pub-id></nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Youyou</surname>
            <given-names>W</given-names>
          </name>
          <name name-style="western">
            <surname>Kosinski</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Stillwell</surname>
            <given-names>D</given-names>
          </name>
        </person-group>
        <article-title>Computer-based personality judgments are more accurate than those made by humans</article-title>
        <source>Proc Natl Acad Sci USA</source>  
        <year>2015</year>  
        <month>01</month>  
        <day>27</day>  
        <volume>112</volume>  
        <issue>4</issue>  
        <fpage>1036</fpage>  
        <lpage>40</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.pnas.org/cgi/pmidlookup?view=long&#38;pmid=25583507"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1073/pnas.1418680112</pub-id>
        <pub-id pub-id-type="medline">25583507</pub-id>
        <pub-id pub-id-type="pii">1418680112</pub-id>
        <pub-id pub-id-type="pmcid">PMC4313801</pub-id></nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Wagland</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Recio-Saucedo</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Simon</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Bracher</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Hunt</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Foster</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Downing</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Glaser</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Corner</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>Development and testing of a text-mining approach to analyse patients' comments on their experiences of colorectal cancer care</article-title>
        <source>BMJ Qual Saf</source>  
        <year>2016</year>  
        <month>08</month>  
        <volume>25</volume>  
        <issue>8</issue>  
        <fpage>604</fpage>  
        <lpage>14</lpage>  
        <pub-id pub-id-type="doi">10.1136/bmjqs-2015-004063</pub-id>
        <pub-id pub-id-type="medline">26512131</pub-id>
        <pub-id pub-id-type="pii">bmjqs-2015-004063</pub-id></nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Ong</surname>
            <given-names>MS</given-names>
          </name>
          <name name-style="western">
            <surname>Magrabi</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Coiera</surname>
            <given-names>E</given-names>
          </name>
        </person-group>
        <article-title>Automated categorisation of clinical incident reports using statistical text classification</article-title>
        <source>Qual Saf Health Care</source>  
        <year>2010</year>  
        <month>12</month>  
        <volume>19</volume>  
        <issue>6</issue>  
        <fpage>e55</fpage>  
        <pub-id pub-id-type="doi">10.1136/qshc.2009.036657</pub-id>
        <pub-id pub-id-type="medline">20724392</pub-id>
        <pub-id pub-id-type="pii">qshc.2009.036657</pub-id></nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Ong</surname>
            <given-names>MS</given-names>
          </name>
          <name name-style="western">
            <surname>Magrabi</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Coiera</surname>
            <given-names>E</given-names>
          </name>
        </person-group>
        <article-title>Automated identification of extreme-risk events in clinical incident reports</article-title>
        <source>J Am Med Inform Assoc</source>  
        <year>2012</year>  
        <month>06</month>  
        <volume>19</volume>  
        <issue>e1</issue>  
        <fpage>e110</fpage>  
        <lpage>8</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/22237865"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1136/amiajnl-2011-000562</pub-id>
        <pub-id pub-id-type="medline">22237865</pub-id>
        <pub-id pub-id-type="pii">amiajnl-2011-000562</pub-id>
        <pub-id pub-id-type="pmcid">PMC3392867</pub-id></nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Hawkins</surname>
            <given-names>JB</given-names>
          </name>
          <name name-style="western">
            <surname>Brownstein</surname>
            <given-names>JS</given-names>
          </name>
          <name name-style="western">
            <surname>Tuli</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Runels</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Broecker</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Nsoesie</surname>
            <given-names>EO</given-names>
          </name>
          <name name-style="western">
            <surname>McIver</surname>
            <given-names>DJ</given-names>
          </name>
          <name name-style="western">
            <surname>Rozenblum</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Wright</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Bourgeois</surname>
            <given-names>FT</given-names>
          </name>
          <name name-style="western">
            <surname>Greaves</surname>
            <given-names>F</given-names>
          </name>
        </person-group>
        <article-title>Measuring patient-perceived quality of care in US hospitals using Twitter</article-title>
        <source>BMJ Qual Saf</source>  
        <year>2016</year>  
        <month>06</month>  
        <volume>25</volume>  
        <issue>6</issue>  
        <fpage>404</fpage>  
        <lpage>13</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://qualitysafety.bmj.com/cgi/pmidlookup?view=long&#38;pmid=26464518"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1136/bmjqs-2015-004309</pub-id>
        <pub-id pub-id-type="medline">26464518</pub-id>
        <pub-id pub-id-type="pii">bmjqs-2015-004309</pub-id>
        <pub-id pub-id-type="pmcid">PMC4878682</pub-id></nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Greaves</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Ramirez-Cano</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Millett</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Darzi</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Donaldson</surname>
            <given-names>L</given-names>
          </name>
        </person-group>
        <article-title>Use of sentiment analysis for capturing patient experience from free-text comments posted online</article-title>
        <source>J Med Internet Res</source>  
        <year>2013</year>  
        <month>11</month>  
        <day>01</day>  
        <volume>15</volume>  
        <issue>11</issue>  
        <fpage>e239</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2013/11/e239/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/jmir.2721</pub-id>
        <pub-id pub-id-type="medline">24184993</pub-id>
        <pub-id pub-id-type="pii">v15i11e239</pub-id>
        <pub-id pub-id-type="pmcid">PMC3841376</pub-id></nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Campbell</surname>
            <given-names>JL</given-names>
          </name>
          <name name-style="western">
            <surname>Roberts</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Wright</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Hill</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Greco</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Taylor</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Richards</surname>
            <given-names>S</given-names>
          </name>
        </person-group>
        <article-title>Factors associated with variability in the assessment of UK doctors' professionalism: analysis of survey results</article-title>
        <source>Br Med J</source>  
        <year>2011</year>  
        <month>10</month>  
        <day>27</day>  
        <volume>343</volume>  
        <fpage>d6212</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.bmj.com/cgi/pmidlookup?view=long&#38;pmid=22034193"/>
        </comment>  
        <pub-id pub-id-type="medline">22034193</pub-id>
        <pub-id pub-id-type="pmcid">PMC3203200</pub-id></nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Richards</surname>
            <given-names>SH</given-names>
          </name>
          <name name-style="western">
            <surname>Campbell</surname>
            <given-names>JL</given-names>
          </name>
          <name name-style="western">
            <surname>Walshaw</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Dickens</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Greco</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>A multi-method analysis of free-text comments from the UK General Medical Council Colleague Questionnaires</article-title>
        <source>Med Educ</source>  
        <year>2009</year>  
        <month>08</month>  
        <volume>43</volume>  
        <issue>8</issue>  
        <fpage>757</fpage>  
        <lpage>66</lpage>  
        <pub-id pub-id-type="doi">10.1111/j.1365-2923.2009.03416.x</pub-id>
        <pub-id pub-id-type="medline">19659489</pub-id>
        <pub-id pub-id-type="pii">MED3416</pub-id></nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Campbell</surname>
            <given-names>JL</given-names>
          </name>
          <name name-style="western">
            <surname>Richards</surname>
            <given-names>SH</given-names>
          </name>
          <name name-style="western">
            <surname>Dickens</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Greco</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Narayanan</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Brearley</surname>
            <given-names>S</given-names>
          </name>
        </person-group>
        <article-title>Assessing the professional performance of UK doctors: an evaluation of the utility of the General Medical Council patient and colleague questionnaires</article-title>
        <source>Qual Saf Health Care</source>  
        <year>2008</year>  
        <month>06</month>  
        <volume>17</volume>  
        <issue>3</issue>  
        <fpage>187</fpage>  
        <lpage>93</lpage>  
        <pub-id pub-id-type="doi">10.1136/qshc.2007.024679</pub-id>
        <pub-id pub-id-type="medline">18519625</pub-id>
        <pub-id pub-id-type="pii">17/3/187</pub-id></nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Holsti</surname>
            <given-names>O</given-names>
          </name>
        </person-group>
        <source>Content Analysis for the Social Sciences and Humanities</source>  
        <year>1969</year>  
        <publisher-loc>Reading, MA</publisher-loc>
        <publisher-name>Addison-Wesley</publisher-name></nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Meyer</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Hornik</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Feinerer</surname>
            <given-names>I</given-names>
          </name>
        </person-group>
        <article-title>Text mining infrastructure in R</article-title>
        <source>J Stat Softw</source>  
        <year>2008</year>  
        <volume>25</volume>  
        <issue>5</issue>  
        <fpage>1</fpage>  
        <lpage>54</lpage>  
        <pub-id pub-id-type="doi">10.18637/jss.v025.i05</pub-id></nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Hornik</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Leisch</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Meyer</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Weingessel</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <source>Researchgate</source>  
        <year>2005</year>  
        <comment>E1071: Misc Functions of the Department of Statistics (E1071), TU Wien 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="https://www.researchgate.net/publication/221678005_E1071_Misc_Functions_of_the_Department_of_Statistics_E1071_TU_Wien">https://www.researchgate.net/publication/221678005_E1071_Misc_Functions_of_the_Department_of_Statistics_E1071_TU_Wien</ext-link></comment> </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Peters</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Hothorn</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Lausen</surname>
            <given-names>B</given-names>
          </name>
        </person-group>
        <article-title>ipred: Improved Predictors</article-title>
        <source>R News</source>  
        <year>2002</year>  
        <volume>2</volume>  
        <issue>2</issue>  
        <fpage>33</fpage>  
        <lpage>36</lpage> </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Tuszynski</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <source>Cran.r-project</source>  
        <year>2002</year>  
        <comment>Tools: moving window statistics, GIF, Base64, ROC AUC, etc 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="https://cran.r-project.org/web/packages/caTools/caTools.pdf">https://cran.r-project.org/web/packages/caTools/caTools.pdf</ext-link></comment> </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Liaw</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Wiener</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Classification and regression by randomForest</article-title>
        <source>R News</source>  
        <year>2002</year>  
        <volume>2</volume>  
        <issue>3</issue>  
        <fpage>18</fpage>  
        <lpage>22</lpage> </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Ripley</surname>
            <given-names>B</given-names>
          </name>
        </person-group>
        <source>Cran.r-project</source>  
        <year>2012</year>  
        <comment>tree: Classification and regression trees 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="https://cran.r-project.org/web/packages/tree/tree.pdf">https://cran.r-project.org/web/packages/tree/tree.pdf</ext-link></comment> </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Jurka</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Collingwood</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Boydstun</surname>
            <given-names>AE</given-names>
          </name>
          <name name-style="western">
            <surname>Grossman</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Van Atteveldt</surname>
            <given-names>W</given-names>
          </name>
        </person-group>
        <article-title>RTextTools: A Supervised Learning Package for Text Classification</article-title>
        <source>The R Journal</source>  
        <year>2013</year>  
        <volume>5</volume>  
        <issue>1</issue>  
        <fpage>6</fpage>  
        <lpage>12</lpage> </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Friedman</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Hastie</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Tibshirani</surname>
            <given-names>R</given-names>
          </name>
        </person-group>
        <article-title>Regularization paths for generalized linear models via coordinate descent</article-title>
        <source>J Stat Softw</source>  
        <year>2010</year>  
        <volume>33</volume>  
        <issue>1</issue>  
        <fpage>1</fpage>  
        <lpage>22</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/20808728"/>
        </comment>  
        <pub-id pub-id-type="medline">20808728</pub-id>
        <pub-id pub-id-type="pmcid">PMC2929880</pub-id></nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Kotsiantis</surname>
            <given-names>SB</given-names>
          </name>
          <name name-style="western">
            <surname>Zaharakis</surname>
            <given-names>ID</given-names>
          </name>
          <name name-style="western">
            <surname>Pintelas</surname>
            <given-names>PE</given-names>
          </name>
        </person-group>
        <article-title>Machine learning: a review of classification and combining techniques</article-title>
        <source>Artif Intell Rev</source>  
        <year>2007</year>  
        <month>11</month>  
        <day>10</day>  
        <volume>26</volume>  
        <issue>3</issue>  
        <fpage>159</fpage>  
        <lpage>190</lpage>  
        <pub-id pub-id-type="doi">10.1007/s10462-007-9052-3</pub-id></nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Sokolova</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Szpakowicz</surname>
            <given-names>S</given-names>
          </name>
        </person-group>
        <article-title>Beyond accuracy, F-score and ROC: a family of discriminant measures for performance evaluation</article-title>
        <source>Adv Artif Intell</source>  
        <year>2006</year>  
        <fpage>1015</fpage>  
        <lpage>1021</lpage>  
        <pub-id pub-id-type="doi">10.1007/11941439_114</pub-id></nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Kohavi</surname>
            <given-names>R</given-names>
          </name>
        </person-group>
        <article-title>A Study of Cross-Validation and Bootstrap for Accuracy Estimation and Model Selection</article-title>
        <source>IJCAI</source>  
        <year>1995</year>  
        <volume>1</volume>  
        <issue>2</issue>  
        <fpage>1137</fpage>  
        <lpage>1143</lpage> </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Samejima</surname>
            <given-names>F</given-names>
          </name>
        </person-group>
        <article-title>Graded Response Model</article-title>
        <source>Handbook of Modern Item Response Theory</source>  
        <year>1997</year>  
        <publisher-loc>New York, NY</publisher-loc>
        <publisher-name>Springer</publisher-name>
        <fpage>95</fpage>  
        <lpage>107</lpage> </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Reeve</surname>
            <given-names>BB</given-names>
          </name>
          <name name-style="western">
            <surname>Hays</surname>
            <given-names>RD</given-names>
          </name>
          <name name-style="western">
            <surname>Bjorner</surname>
            <given-names>JB</given-names>
          </name>
          <name name-style="western">
            <surname>Cook</surname>
            <given-names>KF</given-names>
          </name>
          <name name-style="western">
            <surname>Crane</surname>
            <given-names>PK</given-names>
          </name>
          <name name-style="western">
            <surname>Teresi</surname>
            <given-names>JA</given-names>
          </name>
          <name name-style="western">
            <surname>Thissen</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Revicki</surname>
            <given-names>DA</given-names>
          </name>
          <name name-style="western">
            <surname>Weiss</surname>
            <given-names>DJ</given-names>
          </name>
          <name name-style="western">
            <surname>Hambleton</surname>
            <given-names>RK</given-names>
          </name>
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Gershon</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Reise</surname>
            <given-names>SP</given-names>
          </name>
          <name name-style="western">
            <surname>Lai</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Cella</surname>
            <given-names>D</given-names>
          </name>
        </person-group>
        <article-title>Psychometric evaluation and calibration of health-related quality of life item banks</article-title>
        <source>Medical Care</source>  
        <year>2007</year>  
        <volume>45</volume>  
        <issue>Suppl 1</issue>  
        <fpage>S22</fpage>  
        <lpage>S31</lpage>  
        <pub-id pub-id-type="doi">10.1097/01.mlr.0000250483.85507.04</pub-id>
        <pub-id pub-id-type="medline">17443115</pub-id></nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Birbeck</surname>
            <given-names>GL</given-names>
          </name>
          <name name-style="western">
            <surname>Kim</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Hays</surname>
            <given-names>RD</given-names>
          </name>
          <name name-style="western">
            <surname>Vickrey</surname>
            <given-names>BG</given-names>
          </name>
        </person-group>
        <article-title>Quality of life measures in epilepsy: how well can they detect change over time?</article-title>
        <source>Neurology</source>  
        <year>2000</year>  
        <month>05</month>  
        <day>09</day>  
        <volume>54</volume>  
        <issue>9</issue>  
        <fpage>1822</fpage>  
        <lpage>1827</lpage>  
        <pub-id pub-id-type="medline">10802791</pub-id></nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Bee</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Gibbons</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Callaghan</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Fraser</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Lovell</surname>
            <given-names>K</given-names>
          </name>
        </person-group>
        <article-title>Evaluating and quantifying user and carer involvement in mental health care planning (EQUIP): co-development of a new patient-reported outcome measure</article-title>
        <source>PLoS One</source>  
        <year>2016</year>  
        <volume>11</volume>  
        <issue>3</issue>  
        <fpage>e0149973</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0149973"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1371/journal.pone.0149973</pub-id>
        <pub-id pub-id-type="medline">26963252</pub-id>
        <pub-id pub-id-type="pii">PONE-D-15-26884</pub-id>
        <pub-id pub-id-type="pmcid">PMC4786101</pub-id></nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Gibbons</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Bower</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Lovell</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Valderas</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Skevington</surname>
            <given-names>S</given-names>
          </name>
        </person-group>
        <article-title>Electronic quality of life assessment using computer-adaptive testing</article-title>
        <source>J Med Internet Res</source>  
        <year>2016</year>  
        <month>09</month>  
        <day>30</day>  
        <volume>18</volume>  
        <issue>9</issue>  
        <fpage>e240</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2016/9/e240/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/jmir.6053</pub-id>
        <pub-id pub-id-type="medline">27694100</pub-id>
        <pub-id pub-id-type="pii">v18i9e240</pub-id>
        <pub-id pub-id-type="pmcid">PMC5065679</pub-id></nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <collab>R Core Team</collab>
        </person-group>
        <source>Gbif</source>  
        <year>2014</year>  
        <comment>R: A language and environment for statistical computing (Version 3.0.2) 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.gbif.org/resource/81287">http://www.gbif.org/resource/81287</ext-link></comment> </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Corner</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Wagland</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Glaser</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Richards</surname>
            <given-names>SM</given-names>
          </name>
        </person-group>
        <article-title>Qualitative analysis of patients' feedback from a PROMs survey of cancer patients in England</article-title>
        <source>BMJ Open</source>  
        <year>2013</year>  
        <volume>3</volume>  
        <issue>4</issue>  
        <fpage>e002316</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://bmjopen.bmj.com/cgi/pmidlookup?view=long&#38;pmid=23578681"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1136/bmjopen-2012-002316</pub-id>
        <pub-id pub-id-type="medline">23578681</pub-id>
        <pub-id pub-id-type="pii">bmjopen-2012-002316</pub-id>
        <pub-id pub-id-type="pmcid">PMC3641435</pub-id></nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Ainsworth</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Buchan</surname>
            <given-names>I</given-names>
          </name>
        </person-group>
        <article-title>Combining health data uses to ignite health system learning</article-title>
        <source>Methods Inf Med</source>  
        <year>2015</year>  
        <volume>54</volume>  
        <issue>6</issue>  
        <fpage>479</fpage>  
        <lpage>487</lpage>  
        <pub-id pub-id-type="doi">10.3414/ME15-01-0064</pub-id>
        <pub-id pub-id-type="medline">26395036</pub-id>
        <pub-id pub-id-type="pii">15-01-0064</pub-id></nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Hastie</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Tibshirani</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Friedman</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <source>Elements of statistical learning</source>  
        <year>2001</year>  
        <publisher-loc>New York</publisher-loc>
        <publisher-name>Springer-Verlag</publisher-name></nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Golub</surname>
            <given-names>GH</given-names>
          </name>
          <name name-style="western">
            <surname>Reinsch</surname>
            <given-names>C</given-names>
          </name>
        </person-group>
        <article-title>Singular value decomposition and least squares solutions</article-title>
        <source>Numer Math</source>  
        <year>1970</year>  
        <month>04</month>  
        <volume>14</volume>  
        <issue>5</issue>  
        <fpage>403</fpage>  
        <lpage>420</lpage>  
        <pub-id pub-id-type="doi">10.1007/BF02163027</pub-id></nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Blei</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Ng</surname>
            <given-names>AY</given-names>
          </name>
          <name name-style="western">
            <surname>Jordan</surname>
            <given-names>MI</given-names>
          </name>
        </person-group>
        <article-title>Latent Dirichlet Allocation</article-title>
        <source>J Mach Learn Res</source>  
        <year>2003</year>  
        <volume>3</volume>  
        <fpage>993</fpage>  
        <lpage>1022</lpage> </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Efron</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Hastie</surname>
            <given-names>T</given-names>
          </name>
        </person-group>
        <source>Computer Age Statistical Inference. 1st ed</source>  
        <year>2016</year>  
        <publisher-loc>Cambridge</publisher-loc>
        <publisher-name>Cambridge University Press</publisher-name>
        <fpage>3</fpage>  
        <lpage>11</lpage> </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
