<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.0" xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v26i1e50236</article-id>
      <article-id pub-id-type="pmid">39088259</article-id>
      <article-id pub-id-type="doi">10.2196/50236</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Classification of Patients’ Judgments of Their Physicians in Web-Based Written Reviews Using Natural Language Processing: Algorithm Development and Validation</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Zhuang</surname>
            <given-names>Yan</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Senst</surname>
            <given-names>Benjamin</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Elbattah</surname>
            <given-names>Mahmoud</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Tomar</surname>
            <given-names>Ayush</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Huang</surname>
            <given-names>Taicheng</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Madanay</surname>
            <given-names>Farrah</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <address>
            <institution>Center for Bioethics and Social Sciences in Medicine</institution>
            <institution>University of Michigan Medical School</institution>
            <addr-line>2800 Plymouth Rd</addr-line>
            <addr-line>Bldg 14, G016</addr-line>
            <addr-line>Ann Arbor, MI, 48109</addr-line>
            <country>United States</country>
            <phone>1 8083524196</phone>
            <email>madanafl@med.umich.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6660-7671</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Tu</surname>
            <given-names>Karissa</given-names>
          </name>
          <degrees>BS</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4822-9501</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Campagna</surname>
            <given-names>Ada</given-names>
          </name>
          <degrees>MA</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0006-9359-8508</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Davis</surname>
            <given-names>J Kelly</given-names>
          </name>
          <degrees>BA</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff7" ref-type="aff">7</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9586-923X</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Doerstling</surname>
            <given-names>Steven S</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff8" ref-type="aff">8</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2899-2470</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>Felicia</given-names>
          </name>
          <degrees>BS</degrees>
          <xref rid="aff9" ref-type="aff">9</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6329-1365</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Ubel</surname>
            <given-names>Peter A</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8218-519X</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Sanford School of Public Policy</institution>
        <institution>Duke University</institution>
        <addr-line>Durham, NC</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Center for Bioethics and Social Sciences in Medicine</institution>
        <institution>University of Michigan Medical School</institution>
        <addr-line>Ann Arbor, MI</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Fuqua School of Business</institution>
        <institution>Duke University</institution>
        <addr-line>Durham, NC</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>School of Medicine</institution>
        <institution>University of Washington</institution>
        <addr-line>Seattle, WA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Center for Advanced Hindsight</institution>
        <institution>Duke University</institution>
        <addr-line>Durham, NC</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff6">
        <label>6</label>
        <institution>Department of Sociology</institution>
        <institution>University of California, Los Angeles</institution>
        <addr-line>Los Angeles, CA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff7">
        <label>7</label>
        <institution>Department of Population Health Sciences</institution>
        <institution>Duke University School of Medicine</institution>
        <addr-line>Durham, NC</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff8">
        <label>8</label>
        <institution>Department of Medicine</institution>
        <institution>Stanford University</institution>
        <addr-line>Stanford, CA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff9">
        <label>9</label>
        <institution>GrantScout</institution>
        <addr-line>San Francisco, CA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Farrah Madanay <email>madanafl@med.umich.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>1</day>
        <month>8</month>
        <year>2024</year>
      </pub-date>
      <volume>26</volume>
      <elocation-id>e50236</elocation-id>
      <history>
        <date date-type="received">
          <day>28</day>
          <month>6</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>26</day>
          <month>1</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>21</day>
          <month>3</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>29</day>
          <month>4</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Farrah Madanay, Karissa Tu, Ada Campagna, J Kelly Davis, Steven S Doerstling, Felicia Chen, Peter A Ubel. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 01.08.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2024/1/e50236" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Patients increasingly rely on web-based physician reviews to choose a physician and share their experiences. However, the unstructured text of these written reviews presents a challenge for researchers seeking to make inferences about patients’ judgments. Methods previously used to identify patient judgments within reviews, such as hand-coding and dictionary-based approaches, have posed limitations to sample size and classification accuracy. Advanced natural language processing methods can help overcome these limitations and promote further analysis of physician reviews on these popular platforms.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to train, test, and validate an advanced natural language processing algorithm for classifying the presence and valence of 2 dimensions of patient judgments in web-based physician reviews: interpersonal manner and technical competence.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We sampled 345,053 reviews for 167,150 physicians across the United States from Healthgrades.com, a commercial web-based physician rating and review website. We hand-coded 2000 written reviews and used those reviews to train and test a transformer classification algorithm called the Robustly Optimized BERT (Bidirectional Encoder Representations from Transformers) Pretraining Approach (RoBERTa). The 2 fine-tuned models coded the reviews for the presence and positive or negative valence of patients’ interpersonal manner or technical competence judgments of their physicians. We evaluated the performance of the 2 models against 200 hand-coded reviews and validated the models using the full sample of 345,053 RoBERTa-coded reviews.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The interpersonal manner model was 90% accurate with precision of 0.89, recall of 0.90, and weighted <italic>F</italic><sub>1</sub>-score of 0.89. The technical competence model was 90% accurate with precision of 0.91, recall of 0.90, and weighted <italic>F</italic><sub>1</sub>-score of 0.90. Positive-valence judgments were associated with higher review star ratings whereas negative-valence judgments were associated with lower star ratings. Analysis of the data by review rating and physician gender corresponded with findings in prior literature.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Our 2 classification models coded interpersonal manner and technical competence judgments with high precision, recall, and accuracy. These models were validated using review star ratings and results from previous research. RoBERTa can accurately classify unstructured, web-based review text at scale. Future work could explore the use of this algorithm with other textual data, such as social media posts and electronic health records.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>web-based physician reviews</kwd>
        <kwd>patient judgments</kwd>
        <kwd>RoBERTa</kwd>
        <kwd>natural language processing</kwd>
        <kwd>text classification</kwd>
        <kwd>machine learning</kwd>
        <kwd>patient experience</kwd>
        <kwd>patient-authored reviews</kwd>
        <kwd>healthcare quality</kwd>
        <kwd>patient care</kwd>
        <kwd>psychology</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Patients increasingly turn to commercial physician rating and review websites to discuss their patient experiences and provide feedback to hospitals and providers [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Patient-authored reviews on these websites may capture factors of the patient experience not otherwise found in traditional patient experience surveys (eg, Press Ganey) or academic research (eg, interviews and questionnaires), such as insurance processing and appointment scheduling [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. These websites have therefore gained increased attention among researchers seeking to better understand what patients care about and how commercial review data compare to other health care quality measures. For example, researchers analyzing commercial hospital reviews identified topics discussed by patients that were not covered in the current Hospital Consumer Assessment of Healthcare Providers and Systems survey, like nurse quality, staff compassion, and the technical aspects of care [<xref ref-type="bibr" rid="ref3">3</xref>]. Other researchers found negative commercial reviews of surgeons focused on surgeon-independent factors, such as wait times and office staff, suggesting patients may consider factors beyond the patient-physician interaction when assessing quality [<xref ref-type="bibr" rid="ref5">5</xref>].</p>
      <p>Physician review websites potentially impact both patient choice and physician care quality. Some prospective patients rely on web-based physician ratings and reviews to help them choose physicians [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. Research shows people prefer words to numbers and can more easily comprehend review narratives than quantitative ratings [<xref ref-type="bibr" rid="ref8">8</xref>]. Additionally, physicians use patient feedback conveyed in web-based written reviews to implement and improve quality measures, particularly related to patient communication [<xref ref-type="bibr" rid="ref8">8</xref>].</p>
      <p>The unstructured narrative text, however, presents a challenge for researchers seeking to make inferences from physician reviews. Methods previously used to identify patient judgments within written reviews include hand-coding [<xref ref-type="bibr" rid="ref1">1</xref>] and dictionary-based approaches [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. Hand-coding approaches, however, are time- and resource-intensive, which limits sample size [<xref ref-type="bibr" rid="ref11">11</xref>]. Likewise, dictionary-based methods, such as Linguistic Inquiry and Word Count, use a context-independent bag-of-words approach, which may overlook misspellings, colloquialisms, and keywords and phrases not captured in prebuilt dictionaries [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>].</p>
      <p>In this paper, we present measures of precision, recall, and accuracy for an advanced natural language processing (NLP) algorithm, fine-tuned to identify the presence and valence of 2 dimensions of patient judgments in web-based physician reviews: interpersonal manner and technical competence. We use an algorithm called the Robustly Optimized BERT (Bidirectional Encoder Representations from Transformers) Pretraining Approach (RoBERTa), which we trained to classify our 2 judgment dimensions in written reviews and which has been successfully applied in other classification contexts (eg, Twitter) [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref15">15</xref>]. RoBERTa’s novelty is in its transformer-based, bidirectional, context-aware approach, wherein it is pretrained on a large corpus of text but is fine-tunable for many NLP tasks, including text classification [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. We validate this algorithm by correlating results with review star ratings and by comparing results with those found in prior literature.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Collection</title>
        <p>We scraped physician profiles, ratings, and review data published on Healthgrades.com in April 2020. We collected primary care physician profiles associated with family medicine, internal medicine, and pediatrics, and surgeon profiles associated with general surgery; orthopedic surgery; and cosmetic, plastic, and reconstructive surgery. Healthgrades.com has a physician profile for every US physician with an active profile listed on the National Provider Identifier Registry [<xref ref-type="bibr" rid="ref17">17</xref>]. In addition to physician profile characteristics, we scraped rating information and up to 20 of the most recent written reviews per physician. On Healthgrades.com, patients can elect to submit a star rating alone (ie, 1-5 stars, no fractions) or a star rating accompanied by a written review. The study was approved by the Duke University institutional review board and all data collected were publicly available and aggregated for research purposes. Our final sample included 345,053 reviews submitted for 167,150 physicians (primary care physicians and surgeons). <xref rid="figure1" ref-type="fig">Figure 1</xref> shows a flow chart of our sample selection.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Sample selection flow chart.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e50236_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Coding Reviews for Interpersonal Manner and Technical Competence</title>
        <sec>
          <title>Hand-Coding the Training Data</title>
          <p>We first cleaned the text to convert non–ASCII-encoded characters to ASCII characters; examples included some apostrophes, dashes, and letters with accents (eg, blasé). We then trained our classification algorithm using a gold standard data set of rigorously hand-coded physician reviews [<xref ref-type="bibr" rid="ref11">11</xref>]. We purposely sampled 2000 random reviews for equal representation of primary care physicians and surgeons, female and male physicians, and low-star (≤3 stars) and high-star (≥4 stars) review ratings. We achieved high interrater reliability with a subset of 300 double-coded reviews (Cohen κ range 0.74-0.85), before proceeding to independently code the remaining reviews.</p>
          <p>We coded each review for the presence or absence of interpersonal manner and technical competence. Reviews could be coded for the presence of only 1 dimension, both dimensions, or neither dimension. Once we indicated the presence of a judgment dimension, we coded the valence of the judgment as positive or negative. If we did not code the presence of a judgment dimension, we would not have a valence indicated for that judgment. <xref rid="figure2" ref-type="fig">Figure 2</xref> provides a diagram with illustrative examples showing how we hand-coded the presence and valence of the 2 judgment dimensions.</p>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>Diagram showing how real physician reviews were hand-coded for the presence and valence of interpersonal manner and technical competence.</p>
            </caption>
            <graphic xlink:href="jmir_v26i1e50236_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Training and Testing the Algorithm</title>
          <p>To code patients’ interpersonal manner and technical competence judgments in the sample of 345,053 reviews, we first used our hand-coded data to train RoBERTa, a transformer classification model. Transformers are neural network systems that use vectors to capture the meanings of words in context and are the main architecture underlying advanced NLP models [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. These state-of-the-art NLP models improve upon prior NLP classification approaches, such as those based on dictionaries or fixed embeddings [<xref ref-type="bibr" rid="ref13">13</xref>]. Specifically, RoBERTa builds bidirectional context-aware embeddings such that the vector representing the word changes depending on its context in the text [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. RoBERTa is pretrained on Book Corpus (800 million words) and English Wikipedia (2500 million words), and can be fine-tuned with one additional level of training data for specific classification tasks [<xref ref-type="bibr" rid="ref20">20</xref>]. We implemented RoBERTa with assistance from simple transformers [<xref ref-type="bibr" rid="ref21">21</xref>], a wrapper library for the HuggingFace Transformers library [<xref ref-type="bibr" rid="ref22">22</xref>]. In our study, each review used to train RoBERTa has its own sequence embedding and helps fine-tune the model to code the reviews for the presence and valence of interpersonal manner and technical competence.</p>
          <p>We fine-tuned 2 multiclass classification models, 1 for classifying interpersonal manner and 1 for technical competence. We tuned each model using 1600 (80%) reviews randomly sampled from our hand-coded training data. We completed 6 iterations through our training data for both models. We used a test data set, or a set of 200 (10%) hand-coded reviews held out of the training data, to evaluate each model’s fit on the training data set while further fine-tuning the model. Finally, we used a new set of 200 (10%) hand-coded reviews to provide an unbiased evaluation of the classification performance of each fully trained model for patients’ interpersonal manner and technical competence judgments. Training and evaluation batch sizes for both models were 1024 sequences; both models used 6 training epochs, with final epoch running losses &lt;0.01.</p>
          <p>After training and testing the 2 models using the 2000 hand-coded reviews, we applied the fully trained models to all the reviews in our data set, including the 2000 reviews we hand-coded. This resulted in a data set with 345,053 reviews coded by RoBERTa for the presence and valence of interpersonal manner and technical competence judgments. We used Python and Google Colab to train RoBERTa on our judgment classification tasks and code our full sample of reviews.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Evaluating the Accuracy of the 2 Classification Models</title>
        <p>Our 2 classification models were highly accurate. The out-of-sample predictive accuracy for the interpersonal manner model was 90% with a weighted <italic>F</italic><sub>1</sub>-score of 0.89 (range 0.82-0.95), precision of 0.89 (range 0.85-0.94), and recall of 0.90 (range 0.80-0.96). The out-of-sample predictive accuracy for the technical competence model was also 90%, with a weighted <italic>F</italic><sub>1</sub>-score of 0.90 (range 0.90-0.92), precision of 0.91 (range 0.88-0.95), and recall of 0.90 (range 0.85-0.95). <xref ref-type="table" rid="table1">Table 1</xref> details the classification performance metrics for the interpersonal manner and technical competence models.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Fine-tuned transformer classification performance for interpersonal manner and technical competence judgments.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="470"/>
            <col width="0"/>
            <col width="200"/>
            <col width="0"/>
            <col width="180"/>
            <col width="0"/>
            <col width="120"/>
            <thead>
              <tr valign="bottom">
                <td colspan="3">Classification model and valence<sup>a</sup></td>
                <td colspan="2">Precision<sup>b</sup></td>
                <td colspan="2">Recall<sup>c</sup></td>
                <td><italic>F</italic><sub>1</sub>-score<sup>d</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="8">
                  <bold>Interpersonal manner model</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>No interpersonal manner</td>
                <td colspan="2">0.85</td>
                <td colspan="2">0.80</td>
                <td colspan="2">0.82</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Negative interpersonal manner</td>
                <td colspan="2">0.88</td>
                <td colspan="2">0.89</td>
                <td colspan="2">0.88</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Positive interpersonal manner</td>
                <td colspan="2">0.94</td>
                <td colspan="2">0.96</td>
                <td colspan="2">0.95</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Accuracy</td>
                <td colspan="2">—<sup>e</sup></td>
                <td colspan="2">—</td>
                <td colspan="2">0.90</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Macro average</td>
                <td colspan="2">0.89</td>
                <td colspan="2">0.88</td>
                <td colspan="2">0.88</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Weighted average</td>
                <td colspan="2">0.89</td>
                <td colspan="2">0.90</td>
                <td colspan="2">0.89</td>
              </tr>
              <tr valign="top">
                <td colspan="8">
                  <bold>Technical competence model</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>No technical competence</td>
                <td colspan="2">0.88</td>
                <td colspan="2">0.91</td>
                <td colspan="2">0.90</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Negative technical competence</td>
                <td colspan="2">0.95</td>
                <td colspan="2">0.85</td>
                <td colspan="2">0.90</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Positive technical competence</td>
                <td colspan="2">0.89</td>
                <td colspan="2">0.95</td>
                <td colspan="2">0.92</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Accuracy</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">0.90</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Macro average</td>
                <td colspan="2">0.91</td>
                <td colspan="2">0.90</td>
                <td colspan="2">0.90</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Weighted average</td>
                <td colspan="2">0.91</td>
                <td colspan="2">0.90</td>
                <td colspan="2">0.90</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>Classification performance is based on a comparison to an evaluation data set of 200 reviews hand-coded by our team of researchers.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>Precision: number of true positives divided by the sum of true positives and false positives.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>Recall: number of true positives divided by the sum of true positives and false negatives.</p>
            </fn>
            <fn id="table1fn4">
              <p><sup>d</sup><italic>F</italic><sub>1</sub>-score: harmonic mean of precision and recall, given by <inline-graphic xlink:href="jmir_v26i1e50236_fig4.png" xlink:type="simple" mimetype="image"/> [<xref ref-type="bibr" rid="ref14">14</xref>].</p>
            </fn>
            <fn id="table1fn5">
              <p><sup>e</sup>Not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Comparing Reviews Coded by RoBERTa and by Hand</title>
        <p>As part of our final sample of 345,053 reviews, the 2000 hand-coded reviews from our training data set were recoded by RoBERTa. The interrater reliability between our hand-coding and RoBERTa was Cohen κ=0.96 for both interpersonal manner and technical competence. Comparing the RoBERTa codes with the original hand codes for these reviews, we found only 107 (5.4%) reviews had coding discrepancies. Of those, 49 (2.5%) reviews had the same interpersonal manner code but different technical competence code, 57 (2.9%) reviews had the same technical competence code but different interpersonal manner code, and 1 (0.05%) review had different codes for both interpersonal manner and technical competence. <xref ref-type="table" rid="table2">Table 2</xref> shows illustrative examples of coding discrepancies between our RoBERTa-coded reviews and our hand-coded reviews from the training data set.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Illustrative examples of discrepancies in reviews coded by RoBERTa<sup>a</sup> and by hand and the reasoning underlying the discrepancies.</p>
          </caption>
          <table border="1" rules="groups" cellpadding="5" frame="hsides" width="1000" cellspacing="0">
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>Review</td>
                <td colspan="2">RoBERTa coding</td>
                <td colspan="2">Hand-coding with reasoning</td>
              </tr>
              <tr valign="top">
                <td/>
                <td>Interpersonal manner</td>
                <td>Technical competence</td>
                <td>Interpersonal manner</td>
                <td>Technical competence</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>#metoo. He knows what he did.</td>
                <td>0<sup>b</sup></td>
                <td>0</td>
                <td>–<sup>c</sup> (Hints at sexual assault by the physician)</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td>When someone is scared, I Think the Dr. should try to comfort them instead of telling them that when I finish this procedure, you will no longer be my patient . I’ll refer you to someone else. So I wish him the best.</td>
                <td>+<sup>d</sup></td>
                <td>0</td>
                <td>– (Feels physician did not provide comfort)</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td>very rude receptionists answering phones, unhelpful, and sarcastic, they should be replaced ! they cant be bothered especially trish! very rude!</td>
                <td>–</td>
                <td>0</td>
                <td>0 (Discusses the interpersonal manner of the staff, not the physician)</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td>Walking on the second day. My daughter also! I would send anyone I know his way... We’re from out of town and after using a local doctor here and having to do it all over again the experience was amazing and recovery was shorter</td>
                <td>0</td>
                <td>0</td>
                <td>0</td>
                <td>+ (Perceives treatment as a success)</td>
              </tr>
              <tr valign="top">
                <td>Awful experience! Thanks to another doctor that happened to see the urgency of my condition, I got the help that I needed. Had it been left to Dr. C<sup>e</sup>, God only knows where I’d be today. Avoid him!!!</td>
                <td>0</td>
                <td>+</td>
                <td>0</td>
                <td>– (Perceives poor physician decision-making)</td>
              </tr>
              <tr valign="top">
                <td>Excellent !!!!!! You will never ever find a better M.D.<break/>Caring, so professional an excellent surgeon with compassion??</td>
                <td>+</td>
                <td>+</td>
                <td>+</td>
                <td>0 (Does not discuss physician’s expertise, treatment, or outcomes)</td>
              </tr>
              <tr valign="top">
                <td>Very similar to all the one star ratings, if 0 stars were an option I’d choose that. The follow up on patients are non-existent, which makes it very obvious that the surgeon just wants $. The staff is always rude. I wish they would treat their patients and their family members how they would like their own to be treated.<break/>My mom has had two infections where they removed her lymph nodes after trying to call them about this several times, we took her to a different doctor to have the site drained</td>
                <td>0</td>
                <td>0</td>
                <td>– (Feels physician prioritized money over care)</td>
                <td>– (Critiques physician’s lack of follow-up care)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>RoBERTa: Robustly Optimized BERT (Bidirectional Encoder Representations from Transformers) Pretraining Approach.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>“0” indicates no judgment coded.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>“–” indicates negative judgment coded.</p>
            </fn>
            <fn id="table2fn4">
              <p><sup>d</sup>“+” indicates positive judgment coded.</p>
            </fn>
            <fn id="table2fn5">
              <p><sup>e</sup>Physician surname reproduced here by only its first letter.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Testing the Validity of the 2 Classification Models</title>
        <sec>
          <title>Overview</title>
          <p>We tested the validity of our classification models using the full sample of RoBERTa-coded reviews. We validated our models in 2 ways. First, we related our valence coding with the review star ratings (ie, 1- to 5-star ratings submitted with each review), with the expectation that positive-valence judgments would be associated with higher star ratings and negative-valence judgments would be associated with lower star ratings. Second, we compared our findings with prior literature on patients’ judgments of physicians in web-based physician reviews.</p>
        </sec>
        <sec>
          <title>Testing for Associations Between Judgment Valence and Star Ratings</title>
          <p>Using multilevel linear regressions, we analyzed associations between patients’ interpersonal manner and technical competence judgment valences and review star ratings. We found evidence of construct validity: positive valences for both interpersonal manner and technical competence were significantly positively associated with review star ratings whereas negative valences for both judgment dimensions were significantly negatively associated with review star ratings. Compared with reviews with no or negative judgment, reviews with positive interpersonal manner were associated with 1.82 (95% CI 1.81-1.83; <italic>P</italic>&lt;.001) more stars, and reviews with positive technical competence were associated with 1.50 (95% CI 1.49-1.51; <italic>P</italic>&lt;.001) more stars. In contrast, compared with reviews with no or positive judgment, reviews with negative interpersonal manner were associated with 3.30 (95% CI –3.31 to –3.29; <italic>P</italic>&lt;.001) fewer stars and reviews with negative technical competence were associated with 3.00 (95% CI –3.01 to –2.98; <italic>P</italic>&lt;.001) fewer stars. <xref rid="figure3" ref-type="fig">Figure 3</xref> displays mean review star ratings for each judgment dimension.</p>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>Mean review star ratings with SDs, for reviews with negative, no, and positive interpersonal manner or technical competence.</p>
            </caption>
            <graphic xlink:href="jmir_v26i1e50236_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Testing Whether the Models Reproduce Prior Findings</title>
          <p>One study analyzing 712 reviews determined that 69% of interpersonal manner reviews and 80% of technical competence reviews were positive [<xref ref-type="bibr" rid="ref1">1</xref>]. We identified a similar pattern of majority positive reviews; 207,327 (81%) reviews mentioning interpersonal manner and 178,705 (82%) reviews mentioning technical competence were positive. Another study reported physicians who received reviews with interpersonal manner language were at least 2.39 times more likely to receive a 5-star review rating [<xref ref-type="bibr" rid="ref9">9</xref>]. We similarly found physicians who received interpersonal manner reviews had 1.69 times the odds of receiving a 5-star review rating (95% CI 1.65-1.73; <italic>P</italic>&lt;.001). When controlling for physician gender, specialty, age, and practicing state, as well as review word count, physicians with interpersonal manner reviews continued to have higher odds of receiving a 5-star review rating (odds ratio [OR] 2.22, 95% CI 2.17-2.28; <italic>P</italic>&lt;.001).</p>
          <p>Prior research also showed female physicians, compared with male physicians, had higher odds of receiving reviews mentioning interpersonal manner [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. Our findings supported these results: we determined female physicians had 1.56 times the odds of receiving a review mentioning interpersonal manner than male physicians (95% CI 1.53-1.59; <italic>P</italic>&lt;.001). When controlling for physician specialty, age, practicing state, and review word count, female physicians still had significantly higher odds of receiving an interpersonal manner review (OR 1.19, 95% CI 1.17-1.22; <italic>P</italic>&lt;.001).</p>
          <p>One group of investigators demonstrated female physicians were more likely than male physicians to receive both reviews praising and reviews criticizing their interpersonal manner [<xref ref-type="bibr" rid="ref10">10</xref>]. Consistent with these results, we found female physicians had 1.40 times the odds of receiving a negative review about their interpersonal manner than male physicians (95% CI 1.36-1.44; <italic>P</italic>&lt;.001). This significant difference remained when including controls (OR 1.25, 95% CI 1.21-1.29; <italic>P</italic>&lt;.001). Female physicians also had 1.18 times the odds of receiving a positive interpersonal manner review than male physicians (95% CI 1.15-1.20; <italic>P</italic>&lt;.001); however, this gender difference was not significant when including controls (OR 1.02, 95% CI 1.00-1.04; <italic>P</italic>=.05).</p>
          <p>Likewise, another study concluded that highly rated male physicians were 1.48 times more likely to receive reviews describing technical competence whereas highly rated female physicians were 2.11 times more likely to receive reviews describing interpersonal manner [<xref ref-type="bibr" rid="ref23">23</xref>]. We found similar results: highly rated female physicians had 1.76 times the odds of receiving an interpersonal manner review (95% CI 1.72-1.81; <italic>P</italic>&lt;.001), which was still significant after including controls (OR 1.25, 95% CI 1.22-1.29; <italic>P</italic>&lt;.001). Highly rated male physicians had 1.33 times the odds of receiving a technical competence review (95% CI 1.30-1.36; <italic>P</italic>&lt;.001); however, with controls, this gender difference flipped, such that highly rated females were more likely to receive a technical competence review (OR 0.95, 95% CI 0.93-0.98; <italic>P</italic>&lt;.001).</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Results</title>
        <p>Our 2 classification models identified the presence and valence of patients’ interpersonal manner and technical competence judgments with high precision, recall, and accuracy. Our models identified these 2 judgment dimensions from a broad training data set inclusive of reviews for female and male physicians, for primary care physicians and surgeons, and for low- and high-star-rated reviews.</p>
        <p>Our interpersonal manner and technical competence models underperformed in classifying reviews with no judgment and negative valence relative to reviews with positive valence. However, the overall predictive accuracy of both models (90%) was higher than the rate of hand-coding agreement among the 4 investigators (Cohen κ=0.84 and 0.77 for interpersonal manner and technical competence, respectively).</p>
        <p>Our models produced classification metrics comparable to those found in other studies that use fine-tuned RoBERTa algorithms for coding tasks. For example, researchers who used RoBERTa to detect polarizing versus nonpolarizing rhetoric in tweets written by Congress members reported a model with 90% predictive accuracy and a weighted <italic>F</italic><sub>1</sub>-score of 0.90 [<xref ref-type="bibr" rid="ref13">13</xref>]. They compared their results to the Valence Aware Dictionary and Sentiment Reasoner, a dictionary-based sentiment analysis model, which demonstrated a 68% accuracy and <italic>F</italic><sub>1</sub>-score of 0.75. Another study that used RoBERTa to classify 5 classes of mental illness in Reddit posts reported a model with an <italic>F</italic><sub>1</sub>-score of 0.86 [<xref ref-type="bibr" rid="ref24">24</xref>]. These researchers showed RoBERTa outperformed both BERT and long short-term memory, a nontransformer neural network text classifier (86% accuracy vs 82% and 72%). Last, researchers who forecasted star ratings from physician reviews written on RateMDs.com demonstrated an 84.6% accuracy and a mean <italic>F</italic><sub>1</sub>-score of 0.83 with their RoBERTa model, which outperformed other NLP models [<xref ref-type="bibr" rid="ref25">25</xref>]. In comparison, our interpersonal manner and technical competence models were each 90% accurate with weighted <italic>F</italic><sub>1</sub>-scores of 0.89 and 0.90, respectively.</p>
        <p>Although we did not compare our own RoBERTa models to other NLP algorithms, our accuracy scores are equal to or better than those of prior methods used to code patients’ judgments in web-based physician reviews. For example, in one study, investigators who hand-coded reviews for 4 broad thematic categories, including interpersonal manner and technical competence, reported an interrater reliability range of κ=0.8-1.0 [<xref ref-type="bibr" rid="ref1">1</xref>]. Another study, which used dictionary-based text analysis to code for positive and negative soft skills, reported a mean accuracy of 0.76 (range 0.42-0.92) [<xref ref-type="bibr" rid="ref10">10</xref>]. The rates of hand-coding agreement for interpersonal manner and technical competence among our own 4 investigators were Cohen κ=0.84 and 0.77, respectively. Last, research has shown RoBERTa outperforms both other pretrained models and traditional machine-learning models (eg, support vector machines and random forests) when used for text classification tasks in the health domain [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref26">26</xref>].</p>
        <p>Our NLP classification models overcome several limitations of prior research using hand-coding and dictionary-based methods to identify the prevalence and valence of patient judgments in web-based physician reviews. Hand-coding, although considered the gold standard, is time-intensive, which limits scalability. Prior studies using multiple coders could only analyze data from sample sizes of fewer than 1000 physician reviews [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. Dictionary-based approaches enable analysis of larger samples but are restricted to the keywords contained in their dictionaries. Dictionary-based or bag-of-words models may overlook misspellings and jargon (eg, “he butchered my surgery”), leading to false negatives. They may also misidentify judgments about nonphysician staff (eg, front desk worker and nurse) or pick up words that have different meanings in different contexts (eg, “he is thorough in his examinations” vs “she gives thorough explanations”), leading to false positives. Dictionary-based models also have difficulty distinguishing between words used positively or with negations (eg, “she was smart” vs “she was not smart”), which complicates valence estimates. Prior research on physician reviews using dictionary-based models could not determine valence and only coded reviews that contained at least 1 preselected dictionary keyword [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>].</p>
        <p>We validated our classification models by examining associations between our coded judgment valence and review star ratings and by comparing our coded judgments to findings from prior studies. We found positive interpersonal manner and technical competence judgments were associated with higher review star ratings whereas negative judgments were associated with lower review star ratings. Additionally, we found similar patterns of results with other studies that have examined the presence and valence of patients’ judgments in web-based physician reviews. Future research should examine how both interpersonal manner and technical competence judgments vary depending on both physician gender and specialty.</p>
        <p>We demonstrate that fine-tuning RoBERTa classification models to code patients’ interpersonal manner and technical competence judgments in web-based physician reviews offers a scalable, reliable, and accurate method for analyzing unstructured textual review data. To our knowledge, we are the first to use an advanced NLP algorithm to code a large data set of web-based physician reviews for patients’ judgments. This algorithm successfully coded web-based physician reviews, suggesting that RoBERTa may also be used to code similar unstructured text, including reviews from other commercial physician review websites (eg, RateMDs and ZocDoc) and from traditional Press Ganey patient-experience surveys. Whereas research has begun to use BERT-based models to extract health care insights from triage notes and medical records [<xref ref-type="bibr" rid="ref30">30</xref>-<xref ref-type="bibr" rid="ref32">32</xref>], future research is needed to ascertain the effectiveness of RoBERTa models with more far-afield text, such as crowdfunding campaigns, social media posts, and recommendation letters.</p>
        <p>We acknowledge patients’ judgments of their physicians’ technical competence should be taken with caution. Prior research has shown weak correlations between patients’ assessments of technical care quality and evidence-based indicators from clinical records [<xref ref-type="bibr" rid="ref33">33</xref>]. Certifying boards and professional societies are better equipped to assess physicians’ technical skills, such as knowledge of diagnostic and therapeutic advances [<xref ref-type="bibr" rid="ref34">34</xref>]; however, patients’ written reviews may be useful in offering reports of what actually occurred during clinical encounters, such as whether the physician checked their blood pressure or offered a flu vaccine [<xref ref-type="bibr" rid="ref35">35</xref>]. Thus, for patient reviews to improve clinical care quality, future classification models for technical competence may consider focusing more narrowly on patient reports of technical processes rather than general perceptions of physicians’ technical skills.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>Our study has 3 broad limitations, which arise from both the training data and the algorithm output. First, despite representing a small sample of our data set, our 2000 hand-coded reviews took substantial time and resources. It is possible that future studies can use smaller samples to train RoBERTa for classification tasks while maintaining high accuracy. Although more research is needed, we explored this idea by re-fine-tuning our models with gradually smaller training data sets (ie, n=1000, 500, and 250 reviews). For both interpersonal manner and technical competence, accuracy decreased with smaller training samples (interpersonal manner: 90%, 87%, 82%, and 64%; technical competence: 90%, 86%, 72%, and 64%). This brief example shows that future researchers could use 1000, possibly 500, but likely not 250 hand-coded reviews to train RoBERTa for multiclass classification. However, researchers should consider the implications of not only accuracy but also precision and recall for each code.</p>
        <p>Second, our RoBERTa models were only as good as our training data, and our training data was imperfect. Although hand-coding is considered the gold standard and team members rigorously followed a coding framework, biases in how individual coders identified patients’ interpersonal manner and technical competence judgments may have influenced the RoBERTa models. The imperfect interrater reliability present within the hand-coded data set is evidence of differences between coders, which may have complicated the fine-tuning of the models. Additionally, the 2000 hand-coded reviews represented only 0.6% of all reviews in the final data set; thus, our models could have been overfitted to this relatively small training data set.</p>
        <p>We also excluded the classification of certain reviews in our training data. Because of our own language barriers, we trained the models on reviews only written in English. Reviews written in other languages, such as Spanish, were not translated and thus received codes of no interpersonal manner and no technical competence, despite potentially describing either judgment dimension. Reviews written in languages other than English, however, represented a small proportion of the total reviews in our sample. In addition, we only trained the models to classify patient judgments of physicians’ interpersonal manner and technical competence, ignoring other judgments. Other judgments categorized as neither interpersonal manner nor technical competence included global remarks (eg, “would definitely recommend to others!” or “the worst”) and system-level comments about the office, staff, or other aspects of the health care experience (eg, “dingy building” or “his assistant was the most wonderful person I have ever met”).</p>
        <p>Third, our RoBERTa models had limitations. Although RoBERTa offers a more advanced NLP algorithm than dictionary-based methods, the algorithm may still not recognize cultural jargon. The first illustrative example of <xref ref-type="table" rid="table2">Table 2</xref>, in which RoBERTa did not recognize the connotation of the #metoo reference, demonstrates this limitation. Moreover, because the RoBERTa algorithm was not trained on a prebuilt dictionary but on a reference set of hand-coded reviews, it is difficult to determine specific words and phrases the models used when classifying interpersonal manner and technical competence judgments. This transparency limitation, often called “black box AI,” is a common problem with deep learning algorithms that create their own neural networks for categorization [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>].</p>
        <p>Despite these limitations, practical benefits of applying advanced NLP algorithms, like RoBERTa, to physician reviews include the enhanced capability to review feedback on what patients like and dislike about their medical encounters at clinician, department, or hospital levels; assistance in discerning differences in physician reviews received in traditional versus web-based surveys; and support in identifying patient biases, if any, corresponding with physician demographics. The benefits of these large language models also extend beyond insights from physician reviews. For example, advanced NLP models can improve patient care through medical information retrieval from medical literature, drug databases, and treatment guidelines; and through personalized clinical decision support by analyzing relevant patient data, such as medical histories, test results, and clinician notes. These models can also reduce physician workload via documentation assistance.</p>
      </sec>
      <sec>
        <title>Conclusion</title>
        <p>We coded a large data set of web-based physician reviews for the presence and valence of patients’ interpersonal manner and technical competence judgments using RoBERTa, a pretrained NLP classification algorithm. We trained and tested our models using a gold standard data set of hand-coded reviews and demonstrated that our models accurately and reliably coded interpersonal manner and technical competence. We also validated the algorithm by comparing our RoBERTa-coded data set with review star ratings and results from prior literature. The RoBERTa algorithm overcomes text analysis limitations present in previous work by identifying patient judgments in a broad range of physician reviews accurately and at scale. Potential benefits of advanced NLP models pertain to web-based physician reviews and beyond, from helping physicians more efficiently assess patient feedback to improving physicians’ workload and patient care.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">BERT</term>
          <def>
            <p>Bidirectional Encoder Representations from Transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">OR</term>
          <def>
            <p>odds ratio</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">RoBERTa</term>
          <def>
            <p>Robustly Optimized BERT Pretraining Approach</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>FM was affiliated with the Sanford School of Public Policy at Duke University at the time of the study and is currently affiliated with the Center for Bioethics and Social Sciences in Medicine at the University of Michigan Medical School. KT was affiliated with the Fuqua School of Business at Duke University at the time of the study and is currently affiliated with the University of Washington School of Medicine. AC was affiliated with the Center for Advanced Hindsight at Duke University at the time of the study and is currently affiliated with the Department of Sociology at UCLA. JKD was affiliated with the Fuqua School of Business at Duke University at the time of the study and is currently affiliated with the Department of Population Health Sciences at Duke University School of Medicine.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>López</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Detz</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ratanawongsa</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Sarkar</surname>
              <given-names>U</given-names>
            </name>
          </person-group>
          <article-title>What patients say about their doctors online: a qualitative content analysis</article-title>
          <source>J Gen Intern Med</source>
          <year>2012</year>
          <volume>27</volume>
          <issue>6</issue>
          <fpage>685</fpage>
          <lpage>692</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/22215270"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s11606-011-1958-4</pub-id>
          <pub-id pub-id-type="medline">22215270</pub-id>
          <pub-id pub-id-type="pmcid">PMC3358396</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kilaru</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Meisel</surname>
              <given-names>ZF</given-names>
            </name>
            <name name-style="western">
              <surname>Paciotti</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ha</surname>
              <given-names>YP</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ranard</surname>
              <given-names>BL</given-names>
            </name>
            <name name-style="western">
              <surname>Merchant</surname>
              <given-names>RM</given-names>
            </name>
          </person-group>
          <article-title>What do patients say about emergency departments in online reviews? a qualitative study</article-title>
          <source>BMJ Qual Saf</source>
          <year>2016</year>
          <volume>25</volume>
          <issue>1</issue>
          <fpage>14</fpage>
          <lpage>24</lpage>
          <pub-id pub-id-type="doi">10.1136/bmjqs-2015-004035</pub-id>
          <pub-id pub-id-type="medline">26208538</pub-id>
          <pub-id pub-id-type="pii">bmjqs-2015-004035</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ranard</surname>
              <given-names>BL</given-names>
            </name>
            <name name-style="western">
              <surname>Werner</surname>
              <given-names>RM</given-names>
            </name>
            <name name-style="western">
              <surname>Antanavicius</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Schwartz</surname>
              <given-names>HA</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Meisel</surname>
              <given-names>ZF</given-names>
            </name>
            <name name-style="western">
              <surname>Asch</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Ungar</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Merchant</surname>
              <given-names>RM</given-names>
            </name>
          </person-group>
          <article-title>Yelp reviews of hospital care can supplement and inform traditional surveys of the patient experience of care</article-title>
          <source>Health Aff (Millwood)</source>
          <year>2016</year>
          <volume>35</volume>
          <issue>4</issue>
          <fpage>697</fpage>
          <lpage>705</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/27044971"/>
          </comment>
          <pub-id pub-id-type="doi">10.1377/hlthaff.2015.1030</pub-id>
          <pub-id pub-id-type="medline">27044971</pub-id>
          <pub-id pub-id-type="pii">35/4/697</pub-id>
          <pub-id pub-id-type="pmcid">PMC4845957</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Armony</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ghose</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>The interplay between online reviews and physician demand: an empirical investigation</article-title>
          <source>Manage Sci</source>
          <year>2021</year>
          <volume>67</volume>
          <issue>12</issue>
          <fpage>7344</fpage>
          <lpage>7361</lpage>
          <pub-id pub-id-type="doi">10.1287/mnsc.2020.3879</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Trehan</surname>
              <given-names>SK</given-names>
            </name>
            <name name-style="western">
              <surname>DeFrancesco</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>JT</given-names>
            </name>
            <name name-style="western">
              <surname>Charalel</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Daluiski</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Online patient ratings of hand surgeons</article-title>
          <source>J Hand Surg Am</source>
          <year>2016</year>
          <volume>41</volume>
          <issue>1</issue>
          <fpage>98</fpage>
          <lpage>103</lpage>
          <pub-id pub-id-type="doi">10.1016/j.jhsa.2015.10.006</pub-id>
          <pub-id pub-id-type="medline">26710742</pub-id>
          <pub-id pub-id-type="pii">S0363-5023(15)01328-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>GP</given-names>
            </name>
            <name name-style="western">
              <surname>Radadia</surname>
              <given-names>KD</given-names>
            </name>
            <name name-style="western">
              <surname>Breyer</surname>
              <given-names>BN</given-names>
            </name>
          </person-group>
          <article-title>Online physician reviews: is there a place for them?</article-title>
          <source>Risk Manag Healthc Policy</source>
          <year>2019</year>
          <volume>12</volume>
          <fpage>85</fpage>
          <lpage>89</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31191060"/>
          </comment>
          <pub-id pub-id-type="doi">10.2147/RMHP.S170381</pub-id>
          <pub-id pub-id-type="medline">31191060</pub-id>
          <pub-id pub-id-type="pii">170381</pub-id>
          <pub-id pub-id-type="pmcid">PMC6526774</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Burkle</surname>
              <given-names>CM</given-names>
            </name>
            <name name-style="western">
              <surname>Keegan</surname>
              <given-names>MT</given-names>
            </name>
          </person-group>
          <article-title>Popularity of internet physician rating sites and their apparent influence on patients' choices of physicians</article-title>
          <source>BMC Health Serv Res</source>
          <year>2015</year>
          <volume>15</volume>
          <fpage>416</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmchealthservres.biomedcentral.com/articles/10.1186/s12913-015-1099-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12913-015-1099-2</pub-id>
          <pub-id pub-id-type="medline">26410383</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12913-015-1099-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC4583763</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Emmert</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Meszmer</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Sander</surname>
              <given-names>U</given-names>
            </name>
          </person-group>
          <article-title>Do health care providers use online patient ratings to improve the quality of care? results from an online-based cross-sectional study</article-title>
          <source>J Med Internet Res</source>
          <year>2016</year>
          <volume>18</volume>
          <issue>9</issue>
          <fpage>e254</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2016/9/e254/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/jmir.5889</pub-id>
          <pub-id pub-id-type="medline">27644135</pub-id>
          <pub-id pub-id-type="pii">v18i9e254</pub-id>
          <pub-id pub-id-type="pmcid">PMC5048057</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Pierson</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Schmer-Galunder</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Altamirano</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jurafsky</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Leskovec</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Fassiotto</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kothary</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Gender differences in patient perceptions of physicians' communal traits and the impact on physician evaluations</article-title>
          <source>J Womens Health (Larchmt)</source>
          <year>2021</year>
          <volume>30</volume>
          <issue>4</issue>
          <fpage>551</fpage>
          <lpage>556</lpage>
          <pub-id pub-id-type="doi">10.1089/jwh.2019.8233</pub-id>
          <pub-id pub-id-type="medline">32857642</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dunivin</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zadunayski</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Baskota</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Siek</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Mankoff</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Gender, soft skills, and patient experience in online physician reviews: a large-scale text analysis</article-title>
          <source>J Med Internet Res</source>
          <year>2020</year>
          <volume>22</volume>
          <issue>7</issue>
          <fpage>e14455</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2020/7/e14455/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/14455</pub-id>
          <pub-id pub-id-type="medline">32729844</pub-id>
          <pub-id pub-id-type="pii">v22i7e14455</pub-id>
          <pub-id pub-id-type="pmcid">PMC7426798</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nelson</surname>
              <given-names>LK</given-names>
            </name>
            <name name-style="western">
              <surname>Burk</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Knudsen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>McCall</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>The future of coding: a comparison of hand-coding and three types of computer-assisted text analysis methods</article-title>
          <source>Sociol Methods Res</source>
          <year>2018</year>
          <volume>50</volume>
          <issue>1</issue>
          <fpage>202</fpage>
          <lpage>237</lpage>
          <pub-id pub-id-type="doi">10.1177/0049124118769114</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Quan</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Review of research on text sentiment analysis based on deep learning</article-title>
          <source>Open Access Library J</source>
          <year>2020</year>
          <volume>07</volume>
          <issue>03</issue>
          <fpage>1</fpage>
          <lpage>8</lpage>
          <pub-id pub-id-type="doi">10.4236/oalib.1106174</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ballard</surname>
              <given-names>AO</given-names>
            </name>
            <name name-style="western">
              <surname>DeTamble</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Dorsey</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Heseltine</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Dynamics of polarizing rhetoric in congressional tweets</article-title>
          <source>Legislative Studies Qtrly</source>
          <year>2022</year>
          <volume>48</volume>
          <issue>1</issue>
          <fpage>105</fpage>
          <lpage>144</lpage>
          <pub-id pub-id-type="doi">10.1111/lsq.12374</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Oliveira</surname>
              <given-names>FB</given-names>
            </name>
            <name name-style="western">
              <surname>Haque</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mougouei</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Evans</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sichman</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>MP</given-names>
            </name>
          </person-group>
          <article-title>Investigating the emotional response to COVID-19 news on Twitter: a topic modeling and emotion classification approach</article-title>
          <source>IEEE Access</source>
          <year>2022</year>
          <volume>10</volume>
          <fpage>16883</fpage>
          <lpage>16897</lpage>
          <pub-id pub-id-type="doi">10.1109/access.2022.3150329</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ott</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Goyal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Joshi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Levy</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Lewis</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zettlemoyer</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Stoyanov</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>RoBERTa: a robustly optimized BERT pretraining approach</article-title>
          <source>arXiv</source>
          <year>2019</year>
          <fpage>190711692</fpage>
          <pub-id pub-id-type="doi">10.48550/arXiv.1907.11692</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Dong</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Al-Garadi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sarker</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Paris</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Aliod</surname>
              <given-names>DM</given-names>
            </name>
          </person-group>
          <article-title>Benchmarking of transformer-based pre-trained models on social media text classification datasets</article-title>
          <year>2020</year>
          <conf-name>Proceedings of the 18th annual workshop of the australasian language technology association</conf-name>
          <conf-date>2020</conf-date>
          <conf-loc>Australia</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="web">
          <article-title>Healthgrades frequently asked questions (FAQs)</article-title>
          <source>healthgrades.com</source>
          <access-date>2022-10-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.healthgrades.com/content/faqs">https://www.healthgrades.com/content/faqs</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vaswani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shazeer</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Parmar</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Uszkoreit</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gomez</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Attention is all you need</article-title>
          <year>2017</year>
          <conf-name>Advances in Neural Information Processing Systems</conf-name>
          <conf-date>2017 Dec 4-9</conf-date>
          <conf-loc>Long Beach, CA, USA</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liao</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Zeng</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Yin</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>An improved aspect-category sentiment analysis model for text sentiment analysis based on RoBERTa</article-title>
          <source>Appl Intell</source>
          <year>2020</year>
          <volume>51</volume>
          <issue>6</issue>
          <fpage>3522</fpage>
          <lpage>3533</lpage>
          <pub-id pub-id-type="doi">10.1007/s10489-020-01964-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>MW</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title>
          <source>arXiv</source>
          <year>2018</year>
          <fpage>181004805</fpage>
          <pub-id pub-id-type="doi">10.48550/arXiv.1810.04805</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rajapakse</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Simple transformers</article-title>
          <source>Python Software Foundation</source>
          <access-date>2024-02-15</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://pypi.org/project/simpletransformers/">https://pypi.org/project/simpletransformers/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wolf</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Debut</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Sanh</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Chaumond</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Delangue</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Moi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cistac</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Rault</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Louf</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Funtowicz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Davison</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shleifer</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>von Platen</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Jernite</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Plu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Le Scao</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Gugger</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Drame</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lhoest</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Rush</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>HuggingFace's Transformers: state-of-the-art natural language processing</article-title>
          <year>2020</year>
          <conf-name>Proceedings of the 2020 conference on empirical methods in natural language processing: system demonstrations</conf-name>
          <conf-loc>Online</conf-loc>
          <fpage>38</fpage>
          <lpage>45</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/2020.emnlp-demos.6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Haynes</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Pampari</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Topham</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Schwarzenberger</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Heath</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Greiling</surname>
              <given-names>TM</given-names>
            </name>
          </person-group>
          <article-title>Patient experience surveys reveal gender-biased descriptions of their care providers</article-title>
          <source>J Med Syst</source>
          <year>2021</year>
          <volume>45</volume>
          <issue>10</issue>
          <fpage>90</fpage>
          <pub-id pub-id-type="doi">10.1007/s10916-021-01766-z</pub-id>
          <pub-id pub-id-type="medline">34468879</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10916-021-01766-z</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Murarka</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Radhakrishnan</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ravichandran</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Classification of mental illnesses on social media using RoBERTa</article-title>
          <year>2021</year>
          <conf-name>Proceedings of the 12th International Workshop on Health Text Mining and Information Analysis</conf-name>
          <conf-date>2021 Apr 19</conf-date>
          <conf-loc>IBM / Raleigh, NC</conf-loc>
          <fpage>59</fpage>
          <lpage>68</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jhaveri</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Gandhi</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Naik</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Nisar</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sonawane</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Predicting doctor ratings from user reviews using deep learning</article-title>
          <year>2022</year>
          <conf-name>2022 International Conference on Applied Artificial Intelligence and Computing (ICAAIC)</conf-name>
          <conf-date>2022 May 12</conf-date>
          <conf-loc>Salem, India</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icaaic53929.2022.9793206</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Al-Garadi</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ruan</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>O'Connor</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gonzalez-Hernandez</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Perrone</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sarker</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Text classification models for the automatic detection of nonmedical prescription medication use from social media</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2021</year>
          <volume>21</volume>
          <issue>1</issue>
          <fpage>27</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-021-01394-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-021-01394-0</pub-id>
          <pub-id pub-id-type="medline">33499852</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-021-01394-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC7835447</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Marrero</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>King</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Fingeret</surname>
              <given-names>AL</given-names>
            </name>
          </person-group>
          <article-title>Impact of surgeon gender on online physician reviews</article-title>
          <source>J Surg Res</source>
          <year>2020</year>
          <volume>245</volume>
          <fpage>510</fpage>
          <lpage>515</lpage>
          <pub-id pub-id-type="doi">10.1016/j.jss.2019.07.047</pub-id>
          <pub-id pub-id-type="medline">31446193</pub-id>
          <pub-id pub-id-type="pii">S0022-4804(19)30541-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jordan</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Understanding gender bias toward physicians using online doctor reviews</article-title>
          <source>Psychol Lang Commun</source>
          <year>2022</year>
          <volume>26</volume>
          <issue>1</issue>
          <fpage>18</fpage>
          <lpage>41</lpage>
          <pub-id pub-id-type="doi">10.2478/plc-2022-0002</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Saifee</surname>
              <given-names>DH</given-names>
            </name>
            <name name-style="western">
              <surname>Hudnall</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Raja</surname>
              <given-names>U</given-names>
            </name>
          </person-group>
          <article-title>Physician gender, patient risk, and web-based reviews: longitudinal study of the relationship between physicians' gender and their web-based reviews</article-title>
          <source>J Med Internet Res</source>
          <year>2022</year>
          <volume>24</volume>
          <issue>4</issue>
          <fpage>e31659</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2022/4/e31659/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/31659</pub-id>
          <pub-id pub-id-type="medline">35394435</pub-id>
          <pub-id pub-id-type="pii">v24i4e31659</pub-id>
          <pub-id pub-id-type="pmcid">PMC9034420</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Arnaud</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Elbattah</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gignon</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Dequen</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Learning embeddings from free-text triage notes using pretrained transformer models</article-title>
          <source>HEALTHINF</source>
          <year>2022</year>
          <volume>5</volume>
          <fpage>835</fpage>
          <lpage>841</lpage>
          <pub-id pub-id-type="doi">10.5220/0011012800003123</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zeng</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Pan</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Qu</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>An ensemble learning strategy for eligibility criteria text classification for clinical trial recruitment: algorithm development and validation</article-title>
          <source>JMIR Med Inform</source>
          <year>2020</year>
          <volume>8</volume>
          <issue>7</issue>
          <fpage>e17832</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2020/7/e17832/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/17832</pub-id>
          <pub-id pub-id-type="medline">32609092</pub-id>
          <pub-id pub-id-type="pii">v8i7e17832</pub-id>
          <pub-id pub-id-type="pmcid">PMC7367522</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ni</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence-based traditional Chinese medicine assistive diagnostic system: validation study</article-title>
          <source>JMIR Med Inform</source>
          <year>2020</year>
          <volume>8</volume>
          <issue>6</issue>
          <fpage>e17608</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2020/6/e17608/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/17608</pub-id>
          <pub-id pub-id-type="medline">32538797</pub-id>
          <pub-id pub-id-type="pii">v8i6e17608</pub-id>
          <pub-id pub-id-type="pmcid">PMC7324998</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rao</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Clarke</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sanderson</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hammersley</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Patients' own assessments of quality of primary care compared with objective records based measures of technical quality of care: cross sectional study</article-title>
          <source>BMJ</source>
          <year>2006</year>
          <volume>333</volume>
          <issue>7557</issue>
          <fpage>19</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/16793783"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmj.38874.499167.7C</pub-id>
          <pub-id pub-id-type="medline">16793783</pub-id>
          <pub-id pub-id-type="pii">bmj.38874.499167.7C</pub-id>
          <pub-id pub-id-type="pmcid">PMC1488754</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Beliveau</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Nishimura</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>O'Gara</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Physician competence: a perspective from the practicing cardiologist</article-title>
          <source>Methodist Debakey Cardiovasc J</source>
          <year>2014</year>
          <volume>10</volume>
          <issue>1</issue>
          <fpage>50</fpage>
          <lpage>52</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/24932364"/>
          </comment>
          <pub-id pub-id-type="doi">10.14797/mdcj-10-1-50</pub-id>
          <pub-id pub-id-type="medline">24932364</pub-id>
          <pub-id pub-id-type="pmcid">PMC4051335</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Coulter</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Can patients assess the quality of health care?</article-title>
          <source>BMJ</source>
          <year>2006</year>
          <volume>333</volume>
          <issue>7557</issue>
          <fpage>1</fpage>
          <lpage>2</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/16809674"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmj.333.7557.1</pub-id>
          <pub-id pub-id-type="medline">16809674</pub-id>
          <pub-id pub-id-type="pii">333/7557/1</pub-id>
          <pub-id pub-id-type="pmcid">PMC1488733</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ratti</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Graves</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Explainable machine learning practices: opening another black box for reliable medical AI</article-title>
          <source>AI Ethics</source>
          <year>2022</year>
          <volume>2</volume>
          <issue>4</issue>
          <fpage>801</fpage>
          <lpage>814</lpage>
          <pub-id pub-id-type="doi">10.1007/s43681-022-00141-z</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>von Eschenbach</surname>
              <given-names>WJ</given-names>
            </name>
          </person-group>
          <article-title>Transparency and the black box problem: why we do not trust AI</article-title>
          <source>Philos Technol</source>
          <year>2021</year>
          <volume>34</volume>
          <issue>4</issue>
          <fpage>1607</fpage>
          <lpage>1622</lpage>
          <pub-id pub-id-type="doi">10.1007/s13347-021-00477-0</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
