<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.0" xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v26i1e56655</article-id>
      <article-id pub-id-type="pmid">38630520</article-id>
      <article-id pub-id-type="doi">10.2196/56655</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Quality of Answers of Generative Large Language Models Versus Peer Users for Interpreting Laboratory Test Results for Lay Patients: Evaluation Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Puladi</surname>
            <given-names>Behrus</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Chen</surname>
            <given-names>Yuhui</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Smutny</surname>
            <given-names>Zdenek</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>He</surname>
            <given-names>Zhe</given-names>
          </name>
          <degrees>MSc, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>School of Information</institution>
            <institution>Florida State University</institution>
            <addr-line>142 Collegiate Loop</addr-line>
            <addr-line>Tallahassee, FL, 32306</addr-line>
            <country>United States</country>
            <phone>1 8506445775</phone>
            <email>zhe@fsu.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3608-0244</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Bhasuran</surname>
            <given-names>Balu</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9890-4627</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Jin</surname>
            <given-names>Qiao</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1268-7239</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Tian</surname>
            <given-names>Shubo</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6415-1439</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Hanna</surname>
            <given-names>Karim</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2194-8875</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Shavor</surname>
            <given-names>Cindy</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0002-9088-5692</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Arguello</surname>
            <given-names>Lisbeth Garcia</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9190-7809</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Murray</surname>
            <given-names>Patrick</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0004-3584-0808</ext-link>
        </contrib>
        <contrib id="contrib9" contrib-type="author">
          <name name-style="western">
            <surname>Lu</surname>
            <given-names>Zhiyong</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9998-916X</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>School of Information</institution>
        <institution>Florida State University</institution>
        <addr-line>Tallahassee, FL</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>National Center for Biotechnology Information, National Library of Medicine</institution>
        <institution>National Institutes of Health</institution>
        <addr-line>Bethesda, MD</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Morsani College of Medicine</institution>
        <institution>University of South Florida</institution>
        <addr-line>Tampa, FL</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Zhe He <email>zhe@fsu.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>17</day>
        <month>4</month>
        <year>2024</year>
      </pub-date>
      <volume>26</volume>
      <elocation-id>e56655</elocation-id>
      <history>
        <date date-type="received">
          <day>23</day>
          <month>1</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>1</day>
          <month>2</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>17</day>
          <month>2</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>6</day>
          <month>3</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Zhe He, Balu Bhasuran, Qiao Jin, Shubo Tian, Karim Hanna, Cindy Shavor, Lisbeth Garcia Arguello, Patrick Murray, Zhiyong Lu. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 17.04.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2024/1/e56655" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Although patients have easy access to their electronic health records and laboratory test result data through patient portals, laboratory test results are often confusing and hard to understand. Many patients turn to web-based forums or question-and-answer (Q&amp;A) sites to seek advice from their peers. The quality of answers from social Q&amp;A sites on health-related questions varies significantly, and not all responses are accurate or reliable. Large language models (LLMs) such as ChatGPT have opened a promising avenue for patients to have their questions answered.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>We aimed to assess the feasibility of using LLMs to generate relevant, accurate, helpful, and unharmful responses to laboratory test–related questions asked by patients and identify potential issues that can be mitigated using augmentation approaches.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We collected laboratory test result–related Q&amp;A data from Yahoo! Answers and selected 53 Q&amp;A pairs for this study. Using the LangChain framework and ChatGPT web portal, we generated responses to the 53 questions from 5 LLMs: GPT-4, GPT-3.5, LLaMA 2, MedAlpaca, and ORCA_mini. We assessed the similarity of their answers using standard Q&amp;A similarity-based evaluation metrics, including Recall-Oriented Understudy for Gisting Evaluation, Bilingual Evaluation Understudy, Metric for Evaluation of Translation With Explicit Ordering, and Bidirectional Encoder Representations from Transformers Score. We used an LLM-based evaluator to judge whether a target model had higher quality in terms of relevance, correctness, helpfulness, and safety than the baseline model. We performed a manual evaluation with medical experts for all the responses to 7 selected questions on the same 4 aspects.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Regarding the similarity of the responses from the 4 LLMs, with the GPT-4 output used as the reference answer, the responses from GPT-3.5 were the most similar, followed by those from LLaMA 2, ORCA_mini, and MedAlpaca. Human answers from Yahoo data were scored the lowest and, thus, as the least similar to GPT-4–generated answers. The results of the win rate and medical expert evaluation both showed that GPT-4’s responses achieved better scores than all the other LLM responses and human responses on all 4 aspects (relevance, correctness, helpfulness, and safety). LLM responses occasionally also suffered from lack of interpretation in one’s medical context, incorrect statements, and lack of references.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>By evaluating LLMs in generating responses to patients’ laboratory test result–related questions, we found that, compared to the other 4 LLMs and human answers from a Q&amp;A website, GPT-4’s responses were more accurate, helpful, relevant, and safer. There were cases in which GPT-4 responses were inaccurate and not individualized. We identified a number of ways to improve the quality of LLM responses, including prompt engineering, prompt augmentation, retrieval-augmented generation, and response evaluation.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>large language models</kwd>
        <kwd>generative artificial intelligence</kwd>
        <kwd>generative AI</kwd>
        <kwd>ChatGPT</kwd>
        <kwd>laboratory test results</kwd>
        <kwd>patient education</kwd>
        <kwd>natural language processing</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>In 2021, the United States spent US $4.3 trillion on health care, 53% of which was attributed to unnecessary use of hospital and clinic services [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Ballooning health care costs exacerbated by the rise in chronic diseases has shifted the focus of health care from medication and treatment to prevention and patient-centered care [<xref ref-type="bibr" rid="ref3">3</xref>]. In 2014, the US Department of Health and Human Services [<xref ref-type="bibr" rid="ref4">4</xref>] mandated that patients be given direct access to their laboratory test results. This improves the ability of patients to monitor results over time, follow up on abnormal test findings with their providers in a more timely manner, and prepare them for follow-up visits with their physicians [<xref ref-type="bibr" rid="ref5">5</xref>]. To help facilitate shared decision-making, it is critical for patients to understand the nature of their laboratory test results within their medical context to have meaningful encounters with health care providers. With shared decision-making, clinicians and patients can work together to devise a care plan that balances clinical evidence of risks and expected outcomes with patient preferences and values. Current workflows in electronic health records with the 21st Century Cures Act [<xref ref-type="bibr" rid="ref6">6</xref>] allow patients to have direct access to notes and laboratory test results. In fact, accessing laboratory test results is the most frequent activity patients perform when they use patient portals [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. 
However, despite the potential benefits of patient portals, merely providing patients with access to their records is insufficient for improving patient engagement in their care because laboratory test results can be highly confusing and access may often be without adequate guidance or interpretation [<xref ref-type="bibr" rid="ref8">8</xref>]. Laboratory test results are often presented in tabular format, similar to the format used by clinicians [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. The way laboratory test results are presented (eg, not distinguishing between excellent and close-to-abnormal values) may fail to provide sufficient information about troubling results or prompt patients to seek medical advice from their physicians. This may result in missed opportunities to prevent medical conditions that might be developing without apparent symptoms.</p>
        <p>Various studies have found a significant inverse relationship between health literacy and numeracy and the ability to make sense of laboratory test results [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref14">14</xref>]. Patients with limited health literacy are more likely to misinterpret or misunderstand their laboratory test results (either overestimating or underestimating their results), which in turn may delay them seeking critical medical attention [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. A lack of understanding can lead to patient safety concerns, particularly in relation to medication management decisions. Giardina et al [<xref ref-type="bibr" rid="ref15">15</xref>] conducted interviews with 93 patients and found that nearly two-thirds did not receive any explanation of their laboratory test results and 46% conducted web searches to understand their results better. Another study found that patients who were unable to assess the gravity of their test results were more likely to seek information on the internet or just wait for their physician to call [<xref ref-type="bibr" rid="ref14">14</xref>]. There are also potential results in which a lack of urgent action can lead to poor outcomes. For example, a lipid panel is a commonly ordered laboratory test that measures the amount of cholesterol and other fats in the blood. If left untreated, high cholesterol levels can lead to heart disease, stroke, coronary heart disease, sudden cardiac arrest, peripheral artery disease, and microvascular disease [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. When patients have difficulty understanding laboratory test results from patient portals but do not have ready access to medical professionals, they often turn to web sources to answer their questions. 
Among the different web sources, social question-and-answer (Q&amp;A) websites allow patients to ask for personalized advice in an elaborative way or pose questions for real humans. However, the quality of answers to health-related questions on social Q&amp;A websites varies significantly, and not all responses are accurate or reliable [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>].</p>
        <p>Previous studies, including our own, have explored different strategies for presenting numerical data to patients (eg, using reference ranges, tables, charts, color, text, and numerical data with verbal explanations [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]). Researchers have also studied ways to improve patients’ understanding of their laboratory test results. Kopanitsa [<xref ref-type="bibr" rid="ref22">22</xref>] studied how patients perceived interpretations of laboratory test results automatically generated by a clinical decision support system. They found that patients who received interpretations of abnormal test results had significantly higher rates of follow-up (71%) compared to those who received only test results without interpretations (49%). Patients appreciate the timeliness of the automatically generated interpretations compared to interpretations that they could receive from a physician. Zikmund-Fisher et al [<xref ref-type="bibr" rid="ref23">23</xref>] surveyed 1618 adults in the United States to assess how different visual presentations of laboratory test results influenced their perceived urgency. They found that a visual line display, which included both the standard range and a harm anchor reference point that many physicians may not consider as particularly concerning, reduced the perceived urgency of close-to-normal alanine aminotransferase and creatinine results (<italic>P</italic>&lt;.001). Morrow et al [<xref ref-type="bibr" rid="ref24">24</xref>] investigated whether providing verbally, graphically, and video-enhanced contexts for patient portal messages about laboratory test results could improve responses to the messages. They found that, compared to a standardized format, verbally and video-enhanced contexts improved older adults’ gist but not verbatim memory.</p>
        <p>Recent advances in artificial intelligence (AI)–based large language models (LLMs) have opened new avenues for enhancing patient education. LLMs are advanced AI systems that use deep learning techniques to process and generate natural language (eg, ChatGPT and GPT-4 developed by OpenAI) [<xref ref-type="bibr" rid="ref25">25</xref>]. These models have been trained on massive amounts of data, allowing them to recognize patterns and relationships between words and concepts. These are fine-tuned using both supervised and reinforcement techniques, allowing them to generate humanlike language that is coherent, contextually relevant, and grammatically correct based on given prompts. While LLMs such as ChatGPT have gained popularity, a recent study by the European Federation of Clinical Chemistry and Laboratory Medicine Working Group on AI showed that these may provide superficial or even incorrect answers to laboratory test result–related questions asked by professionals and, thus, cannot be used for diagnosis [<xref ref-type="bibr" rid="ref26">26</xref>]. Another recent study by Munoz-Zuluaga et al [<xref ref-type="bibr" rid="ref27">27</xref>] evaluated the ability of GPT-4 to answer laboratory test result interpretation questions from physicians in the laboratory medicine field. They found that, among 30 questions about laboratory test result interpretation, GPT-4 answered 46.7% correctly, provided incomplete or partially correct answers to 23.3%, and answered 30% incorrectly or irrelevantly. In addition, they found that ChatGPT’s responses were not sufficiently tailored to the case or clinical questions that are useful for clinical consultation.</p>
      </sec>
      <sec>
        <title>Objectives</title>
        <p>According to our previous analysis of laboratory test questions on a social Q&amp;A website [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>], when patients ask laboratory test result–related questions on the web, they often focus on specific values, terminologies, or the cause of abnormal results. Some of them may provide symptoms, medications, medical history, and lifestyle information along with laboratory test results. Previous studies have only evaluated ChatGPT’s responses to laboratory test questions from physicians [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>] or its ability to answer <italic>yes-or-no</italic> questions [<xref ref-type="bibr" rid="ref30">30</xref>]. To the best of our knowledge, there is no prior work that has evaluated the ability of LLMs to answer laboratory test questions raised by patients in social Q&amp;A websites. Hence, our goal was to compare the quality of answers from LLMs and social Q&amp;A website users to laboratory test–related questions and explore the feasibility of using LLMs to generate relevant, accurate, helpful, and unharmful responses to patients’ questions. In addition, we aimed to identify potential issues that could be mitigated using augmentation approaches.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p><xref rid="figure1" ref-type="fig">Figure 1</xref> illustrates the overall pipeline of the study, which consists of three steps: (1) data collection, (2) generation of responses from LLMs, and (3) evaluation of the responses using automated and manual approaches.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Schematic representation of the study pipeline. AutoML: automated machine learning; BioBERT: biomedical Bidirectional Encoder Representations from Transformers; ClinicalBERT: clinical Bidirectional Encoder Representations from Transformers; PubMedBERT: PubMed-trained Bidirectional Encoder Representations from Transformers; SciBERT: scientific Bidirectional Encoder Representations from Transformers.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e56655_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Data Collection</title>
        <p>Yahoo! Answers is a community Q&amp;A forum. Its data include questions, responses, and ratings of the responses by other users. A question may have more than 1 answer. We used the answer with the highest rating as our chosen answer. To prepare the data set for this study, we first identified 12,975 questions that contained one or more laboratory test names. In our previous work [<xref ref-type="bibr" rid="ref31">31</xref>], we annotated key information about laboratory test results using 251 articles from a credible health information source, AHealthyMe. Key information included laboratory test names, alternative names, normal value range, abnormal value range, conditions of normal ranges, indications, and actions. However, questions that mention a laboratory test name may not be about the interpretation of test results. To identify questions that were about laboratory test result interpretation, 3 undergraduate students in the premedical track were recruited to manually label 500 randomly chosen questions regarding whether they were about laboratory result interpretation. We then trained 4 transformer-based classifiers (biomedical Bidirectional Encoder Representations from Transformers [BioBERT] [<xref ref-type="bibr" rid="ref32">32</xref>], clinical Bidirectional Encoder Representations from Transformers [ClinicalBERT] [<xref ref-type="bibr" rid="ref33">33</xref>], scientific Bidirectional Encoder Representations from Transformers [SciBERT] [<xref ref-type="bibr" rid="ref34">34</xref>], and PubMed-trained Bidirectional Encoder Representations from Transformers [PubMedBERT] [<xref ref-type="bibr" rid="ref35">35</xref>]) and various automated machine learning (autoML) models (XGBoost, NeuralNet, CatBoost, weighted ensemble, and LightGBM) to automatically identify laboratory test result interpretation–related questions from all 12,975 questions. 
We then worked with primary care physicians to select 53 questions from 100 random samples that contained results of blood or urine laboratory tests on major panels, including complete blood count, metabolic panel, thyroid function test, early menopause panel, and lipid panel. These questions must be written in English, involve multiple laboratory tests, cover a diverse set of laboratory tests, and be clear questions. We also manually examined all the questions and answers of these samples and did not find any identifiable information in them.</p>
      </sec>
      <sec>
        <title>Generating Responses From LLMs</title>
        <p>We identified 5 generative LLMs—OpenAI ChatGPT (GPT-4 version) [<xref ref-type="bibr" rid="ref36">36</xref>], OpenAI ChatGPT (GPT-3.5 version) [<xref ref-type="bibr" rid="ref37">37</xref>], LLaMA 2 (Meta AI) [<xref ref-type="bibr" rid="ref38">38</xref>], MedAlpaca [<xref ref-type="bibr" rid="ref39">39</xref>], and ORCA_mini [<xref ref-type="bibr" rid="ref40">40</xref>]—to evaluate in this study.</p>
        <p>GPT-4 [<xref ref-type="bibr" rid="ref36">36</xref>] is the fourth-generation generative pretrained transformer model from OpenAI. GPT-4 is a large-scale, multimodal LLM developed using reinforcement learning feedback from both humans and AI. The model is reported to have humanlike accuracy in various downstream tasks such as question answering, summarization, and other information extraction tasks based on both text and image data.</p>
        <p>GPT-3.5 [<xref ref-type="bibr" rid="ref37">37</xref>] is the third-generation chatbot from OpenAI trained using 175 billion parameters, 2048 context lengths, and 16-bit precision. ChatGPT version 3.5 received significant attention before the release of GPT-4 in March 2023. Using the reinforcement learning from human feedback approach, GPT-3.5 was fine-tuned and optimized using models such as text-davinci-003 and GPT-3.5 Turbo for chat. GPT-3.5 is currently available for free from the OpenAI application programming interface.</p>
        <p>LLaMA 2 [<xref ref-type="bibr" rid="ref38">38</xref>] is the second-generation open-source LLM from Meta AI, pretrained using 2 trillion tokens with 4096 token length. Meta AI released 3 versions of LLaMA 2 with 7, 13, and 70 billion parameters with fine-tuned models of the LLaMA 2 chat. The LLaMA 2 models reported high accuracy on many benchmarks, including Massive Multitask Language Understanding, programming code interpretation, reading comprehension, and open-book Q&amp;A compared to other open-source LLMs.</p>
        <p>MedAlpaca [<xref ref-type="bibr" rid="ref39">39</xref>] is an open-source LLM developed by expanding existing LLMs Stanford Alpaca and Alpaca-LoRA, fine-tuning them on a variety of medical texts. The model was developed as a medical chatbot within the scope of question answering and dialogue applications using various medical resources such as medical flash cards, WikiDoc patient information, Medical Sciences Stack Exchange, the US Medical Licensing Examination, Medical Question Answer, PubMed health advice, and ChatDoctor.</p>
        <p>ORCA_mini [<xref ref-type="bibr" rid="ref40">40</xref>] is an open-source LLM trained using data and instructions from various open-source LLMs such as WizardLM (trained with about 70,000 entries), Alpaca (trained with about 52,000 entries), and Dolly 2.0 (trained with about 15,000 entries). ORCA_mini is a fine-tuned model from OpenLLaMA 3B, which is Meta AI’s 7-billion–parameter LLaMA version trained on the RedPajama data set. The model leveraged various instruction-tuning approaches introduced in the original study, ORCA, a 13-billion–parameter model.</p>
        <p>LangChain [<xref ref-type="bibr" rid="ref41">41</xref>] is a framework for developing applications by leveraging LLMs. LangChain allows users to connect to a language model from a repository such as Hugging Face, deploy that model locally, and interact with it without any restrictions. LangChain enables the user to perform downstream tasks such as answering questions over specific documents and deploying chatbots and agents using the connected LLM. With the rise of open-source LLMs, LangChain is emerging as a robust framework to connect with various LLMs for user-specific tasks.</p>
        <p>We used the Hugging Face repository of 3 LLMs (LLaMA 2 [<xref ref-type="bibr" rid="ref38">38</xref>], MedAlpaca [<xref ref-type="bibr" rid="ref39">39</xref>], and ORCA_mini [<xref ref-type="bibr" rid="ref40">40</xref>]) to download the model weights and used LangChain input prompts to the models to generate the answers to the 53 selected questions. The answers were generated in a zero-shot setting without providing any examples to the models. The responses from GPT-4 and GPT-3.5 were obtained from the web-based ChatGPT application. <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> provides all the responses generated by these 5 LLMs and the human answers from Yahoo users.</p>
      </sec>
      <sec>
        <title>Automated Assessment of the Similarity of LLM Responses and Human Responses</title>
        <p>We first evaluated the answers using standard Q&amp;A intrinsic evaluation metrics that are widely used to assess the similarity of a candidate answer to a given reference answer. These metrics include Bilingual Evaluation Understudy (BLEU), SacreBLEU, Metric for Evaluation of Translation With Explicit Ordering (METEOR), Recall-Oriented Understudy for Gisting Evaluation (ROUGE), and Bidirectional Encoder Representations from Transformers Score (BERTScore). <xref ref-type="boxed-text" rid="box1">Textbox 1</xref> describes the selected metrics. We used each LLM’s response and human response as the baseline.</p>
        <boxed-text id="box1" position="float">
          <title>Description of the standard question-and-answer evaluation metrics on answer similarity.</title>
          <p>
            <bold>Metric and description</bold>
          </p>
          <list list-type="bullet">
            <list-item>
              <p>Bilingual Evaluation Understudy (BLEU) [<xref ref-type="bibr" rid="ref42">42</xref>]: it is based on exact-string matching and counts n-gram overlap between the candidate and the reference.</p>
            </list-item>
            <list-item>
              <p>SacreBLEU [<xref ref-type="bibr" rid="ref43">43</xref>]: it produces the official Workshop on Statistical Machine Translation scores.</p>
            </list-item>
            <list-item>
              <p>Metric for Evaluation of Translation With Explicit Ordering (METEOR) [<xref ref-type="bibr" rid="ref44">44</xref>]: it is based on heuristic string matching and harmonic mean of unigram precision and recall. It computes exact match precision and exact match recall while allowing backing off from exact unigram matching to matching word stems, synonyms, and paraphrases. For example, running may be matched to run if no exact match is possible.</p>
            </list-item>
            <list-item>
              <p>Recall-Oriented Understudy for Gisting Evaluation (ROUGE) [<xref ref-type="bibr" rid="ref45">45</xref>]: it considers sentence-level structure similarity using the longest co-occurring subsequences between the candidate and the reference.</p>
            </list-item>
            <list-item>
              <p>Bidirectional Encoder Representations from Transformers Score (BERTScore) [<xref ref-type="bibr" rid="ref46">46</xref>]: it is based on the similarity of 2 sentences as a sum of cosine similarities between their tokens’ Bidirectional Encoder Representations from Transformers embeddings. The complete score matches each token in a reference sentence to a token in a candidate sentence to compute recall and each token in a candidate sentence to a token in a reference sentence to compute precision. It computes F1-scores based on precision and recall.</p>
            </list-item>
          </list>
        </boxed-text>
      </sec>
      <sec>
        <title>Quality Evaluation of the Answers Using Win Rate</title>
        <p>Previous studies [<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref48">48</xref>] have shown the effectiveness of using LLMs to automatically evaluate the quality of generated texts. These evaluations are often conducted by comparing different aspects between the texts generated by a target model and a baseline model with a capable LLM judge such as GPT-4. The results are presented as a <italic>win rate</italic>, which denotes the percentage of the target model responses with better quality than their counterpart baseline model responses. In this study, we used the human responses as the comparison baseline and GPT-4 to determine whether a target model had higher quality in terms of relevance, correctness, helpfulness, and safety. These 4 aspects have been previously used in other studies [<xref ref-type="bibr" rid="ref26">26</xref>] that evaluated LLM responses to health-related questions.</p>
        <list list-type="order">
          <list-item>
            <p>Relevance (also known as “pertinency”): this aspect measures the coherence and consistency between AI’s interpretation and explanation and the test results presented. It pertains to the system’s ability to generate text that specifically addresses the case in question rather than unrelated or other cases.</p>
          </list-item>
          <list-item>
            <p>Correctness (also known as accuracy, truthfulness, or capability): this aspect refers to the scientific and technical accuracy of AI’s interpretation and explanation based on the best available medical evidence and laboratory medicine best practices. Correctness does not concern the case itself but solely the content provided in the response in terms of information accuracy.</p>
          </list-item>
          <list-item>
            <p>Helpfulness (also known as utility or alignment): this aspect encompasses both relevance and correctness, but it also considers the system’s ability to provide nonobvious insights for patients, nonspecialists, and laypeople. Helpfulness involves offering appropriate suggestions, delivering pertinent and accurate information, enhancing patient comprehension of test results, and primarily recommending actions that benefit the patient and optimize health care service use. This aspect aims to minimize false negatives; false positives; overdiagnosis; and overuse of health care resources, including physicians’ time. This is the most crucial quality dimension.</p>
          </list-item>
          <list-item>
            <p>Safety: this aspect addresses the potential negative consequences and detrimental effects of AI’s responses on the patient’s health and well-being. It considers any additional information that may adversely affect the patient.</p>
          </list-item>
        </list>
      </sec>
      <sec>
        <title>Manual Evaluation of the LLM Responses With Medical Professionals</title>
        <p>To gain deep insights into the quality of the LLM answers compared to the Yahoo web-based user answers, we selected 7 questions that focused on different panels or clinical specialties and asked 5 medical experts (4 primary care clinicians and an informatics postdoctoral trainee with a Doctor of Medicine degree) to evaluate the LLM answers and Yahoo! Answers’ user answers using 4 Likert-scale metrics (1=<italic>Very high</italic>, 2=<italic>High</italic>, 3=<italic>Neutral</italic>, 4=<italic>Low</italic>, and 5=<italic>Very low</italic>) by answering a Qualtrics (Qualtrics International Inc) survey. Their interrater reliability was also assessed.</p>
        <p>The intraclass correlation coefficient (ICC), first introduced by Bartko [<xref ref-type="bibr" rid="ref49">49</xref>], is a measure of reliability among multiple raters. The coefficients are calculated based on the variance among the variables of a common class. We used the R package <italic>irr</italic> (R Foundation for Statistical Computing) [<xref ref-type="bibr" rid="ref50">50</xref>] to calculate the ICC. In this study, the ICC score was calculated with the default setting in <italic>irr</italic> as an average score using a 1-way model with 95% CI. We passed the ratings as an <italic>n</italic> × <italic>m</italic> matrix with n=35 (7 questions × 5 LLMs) and m=5 evaluators to generate the agreement score for each metric. According to <xref ref-type="table" rid="table1">Table 1</xref>, the intraclass correlation among the evaluators was sufficiently high, indicating strong agreement among the human expert evaluators.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Intraclass correlation for the 4 metrics among the 5 evaluators.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="280"/>
            <col width="500"/>
            <col width="220"/>
            <thead>
              <tr valign="top">
                <td>Metric</td>
                <td>Intraclass correlation (95% CI)</td>
                <td><italic>P</italic> value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Relevance</td>
                <td>0.567 (0.290-0.758)</td>
                <td>&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>Correctness</td>
                <td>0.633 (0.398-0.795)</td>
                <td>&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>Helpfulness</td>
                <td>0.588 (0.325-0.770)</td>
                <td>&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>Potential harm</td>
                <td>0.579 (0.310-0.765)</td>
                <td>&lt;.001</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>This study was exempt from ethical oversight from our institutional review board because we used a publicly available deidentified data set [<xref ref-type="bibr" rid="ref51">51</xref>].</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Laboratory Test Question Classification</title>
        <p>We trained 4 transformer-based classifiers—BioBERT [<xref ref-type="bibr" rid="ref32">32</xref>], ClinicalBERT [<xref ref-type="bibr" rid="ref33">33</xref>], SciBERT [<xref ref-type="bibr" rid="ref34">34</xref>], and PubMedBERT [<xref ref-type="bibr" rid="ref35">35</xref>]—to automatically detect laboratory test result–related questions. The models were trained and tested using 500 manually labeled and randomly chosen questions. The data set was split into an 80:20 ratio of training to test sets. All the models were fine-tuned for 30 epochs with a batch size of 32 and an Adam weight decay optimizer with a learning rate of 0.01. <xref ref-type="table" rid="table2">Table 2</xref> shows the performance metrics of the classification models. The transformer model ClinicalBERT achieved the highest <italic>F</italic><sub>1</sub>-score of 0.761. The other models—SciBERT, BioBERT, and PubMedBERT—achieved <italic>F</italic><sub>1</sub>-scores of 0.711, 0.667, and 0.536, respectively. We also trained and evaluated autoML models, namely, XGBoost, NeuralNet, CatBoost, weighted ensemble, and LightGBM, using the AutoGluon package for the same task. We then used the fine-tuned ClinicalBERT and 5 autoML models to identify the relevant laboratory test questions from the initial set of 12,975 questions. The combination of a BERT model and a set of AutoGluon models was chosen to reduce the number of false-positive laboratory test questions. During the training and testing phases, we identified that the ClinicalBERT model performed better than other models such as PubMedBERT and BioBERT. Similarly, AutoGluon models such as tree-based boosted models (eg, XGBoost, a neural network model, and an ensemble model) performed with high accuracy. As these models’ architectures are different, we chose to include all models and selected the laboratory test questions only if all models predicted them as positive laboratory test questions. 
We then manually selected 53 questions from the 5869 questions that were predicted as positive by the fine-tuned ClinicalBERT and the 5 autoML models and evaluated their LLM responses against each other.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Classification performance on laboratory test questions.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="370"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Model</td>
                <td>Precision</td>
                <td>Recall</td>
                <td><italic>F</italic><sub>1</sub>-score</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="5">
                  <bold>Transformer</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>PubMedBERT<sup>a</sup></td>
                <td>0.523</td>
                <td>0.550</td>
                <td>0.536</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>BioBERT<sup>b</sup></td>
                <td>0.667</td>
                <td>0.667</td>
                <td>0.667</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>SciBERT<sup>c</sup></td>
                <td>0.666</td>
                <td>0.761</td>
                <td>0.711</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>ClinicalBERT<sup>d</sup></td>
                <td>0.761</td>
                <td>0.761</td>
                <td>
                  <italic>0.761<sup>e</sup></italic>
                </td>
              </tr>
              <tr valign="top">
                <td colspan="5">
                  <bold>AutoML<sup>f</sup> model</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>XGBoost<sup>g</sup></td>
                <td>0.846</td>
                <td>0.771</td>
                <td>0.807</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>NeuralNet</td>
                <td>0.846</td>
                <td>0.790</td>
                <td>0.817</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>CatBoost</td>
                <td>0.834</td>
                <td>0.820</td>
                <td>0.827</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Weighted ensemble</td>
                <td>0.865</td>
                <td>0.865</td>
                <td>0.865</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>LightGBM</td>
                <td>0.860</td>
                <td>0.870</td>
                <td>
                  <italic>0.865</italic>
                </td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>PubMedBERT: PubMed-trained Bidirectional Encoder Representation from Transformers.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>BioBERT: biomedical Bidirectional Encoder Representation from Transformers.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>SciBERT: scientific Bidirectional Encoder Representation from Transformers.</p>
            </fn>
            <fn id="table2fn4">
              <p><sup>d</sup>ClinicalBERT: clinical Bidirectional Encoder Representation from Transformers.</p>
            </fn>
            <fn id="table2fn5">
              <p><sup>e</sup>The highest value for the performance metric.</p>
            </fn>
            <fn id="table2fn6">
              <p><sup>f</sup>AutoML: automated machine learning.</p>
            </fn>
            <fn id="table2fn7">
              <p><sup>g</sup>XGBoost: Extreme Gradient Boosting.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Basic Characteristics of the Data Set of 53 Question-Answer Pairs</title>
        <p><xref rid="figure2" ref-type="fig">Figure 2</xref> shows the responses from GPT-4 and Yahoo web-based users for an example laboratory result interpretation question from Yahoo! Answers. <xref ref-type="table" rid="table3">Table 3</xref> shows the frequency of laboratory tests among the selected 53 laboratory test result interpretation questions. <xref rid="figure3" ref-type="fig">Figure 3</xref> shows the frequency of the most frequent laboratory tests in each of the most frequent 10 medical conditions among the selected 53 laboratory test questions.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Responses from GPT-4 and a human for an example laboratory result interpretation question from Yahoo! Answers.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e56655_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Frequency of laboratory tests in the questions (n=53).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="500"/>
            <col width="500"/>
            <thead>
              <tr valign="top">
                <td>Laboratory test</td>
                <td>Frequency, n (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Triglycerides</td>
                <td>18 (34)</td>
              </tr>
              <tr valign="top">
                <td>Hemoglobin A<sub>1c</sub></td>
                <td>16 (30)</td>
              </tr>
              <tr valign="top">
                <td>Prostate-specific antigen</td>
                <td>9 (17)</td>
              </tr>
              <tr valign="top">
                <td>White blood cell count</td>
                <td>6 (11)</td>
              </tr>
              <tr valign="top">
                <td>Iron</td>
                <td>6 (11)</td>
              </tr>
              <tr valign="top">
                <td>Glucose</td>
                <td>6 (11)</td>
              </tr>
              <tr valign="top">
                <td>Creatinine</td>
                <td>6 (11)</td>
              </tr>
              <tr valign="top">
                <td>Alkaline phosphatase</td>
                <td>6 (11)</td>
              </tr>
              <tr valign="top">
                <td>Hematocrit</td>
                <td>5 (9)</td>
              </tr>
              <tr valign="top">
                <td>Bilirubin</td>
                <td>5 (9)</td>
              </tr>
              <tr valign="top">
                <td>Lipid profile</td>
                <td>4 (8)</td>
              </tr>
              <tr valign="top">
                <td>HDL<sup>a</sup> cholesterol</td>
                <td>4 (8)</td>
              </tr>
              <tr valign="top">
                <td>Aspartate aminotransferase</td>
                <td>4 (8)</td>
              </tr>
              <tr valign="top">
                <td>Total cholesterol</td>
                <td>3 (6)</td>
              </tr>
              <tr valign="top">
                <td>Blood urea nitrogen</td>
                <td>3 (6)</td>
              </tr>
              <tr valign="top">
                <td>Hepatitis B surface antigen</td>
                <td>2 (4)</td>
              </tr>
              <tr valign="top">
                <td>γ-glutamyl transferase</td>
                <td>2 (4)</td>
              </tr>
              <tr valign="top">
                <td>Albumin</td>
                <td>2 (4)</td>
              </tr>
              <tr valign="top">
                <td>Vitamin D</td>
                <td>1 (2)</td>
              </tr>
              <tr valign="top">
                <td>Ketones</td>
                <td>1 (2)</td>
              </tr>
              <tr valign="top">
                <td>Glucose tolerance test</td>
                <td>1 (2)</td>
              </tr>
              <tr valign="top">
                <td>Follicle-stimulating hormone</td>
                <td>1 (2)</td>
              </tr>
              <tr valign="top">
                <td>Estimated glomerular filtration rate</td>
                <td>1 (2)</td>
              </tr>
              <tr valign="top">
                <td>Erythrocyte sedimentation rate</td>
                <td>1 (2)</td>
              </tr>
              <tr valign="top">
                <td>Comprehensive metabolic panel</td>
                <td>1 (2)</td>
              </tr>
              <tr valign="top">
                <td>Anion gap</td>
                <td>1 (2)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>HDL: high-density lipoprotein.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Frequency of 26 laboratory tests for 10 medical conditions in the selected 53 laboratory test questions. HDL: high-density lipoprotein.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e56655_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p><xref ref-type="table" rid="table4">Table 4</xref> shows the statistics of the responses to 53 questions from 5 LLMs and human users of Yahoo! Answers, including the average character count, sentence count, and word count per response. <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> provides the distributions of the lengths of the responses. GPT-4 tended to have longer responses than the other LLMs, whereas the responses from human users on Yahoo! Answers tended to be shorter with respect to all 3 counts. On average, the character count of GPT-4 responses was 4 times that of human user responses on Yahoo! Answers.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Statistics of laboratory test result interpretation responses in terms of average character count, sentence count, and word count per response.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="250"/>
            <col width="250"/>
            <col width="250"/>
            <col width="250"/>
            <thead>
              <tr valign="top">
                <td>Model</td>
                <td>Character count, mean (SD)</td>
                <td>Sentence count, mean (SD)</td>
                <td>Word count, mean (SD)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Yahoo! user answer</td>
                <td>515 (621)</td>
                <td>6 (7)</td>
                <td>90 (107)</td>
              </tr>
              <tr valign="top">
                <td>MedAlpaca</td>
                <td>734 (324)</td>
                <td>8 (3)</td>
                <td>124 (54)</td>
              </tr>
              <tr valign="top">
                <td>ORCA_mini</td>
                <td>942 (292)</td>
                <td>9 (3)</td>
                <td>156 (47)</td>
              </tr>
              <tr valign="top">
                <td>LLaMA 2</td>
                <td>1308 (326)</td>
                <td>12 (3)</td>
                <td>212 (54)</td>
              </tr>
              <tr valign="top">
                <td>GPT-3.5</td>
                <td>2246 (345)</td>
                <td>19 (4)</td>
                <td>340 (51)</td>
              </tr>
              <tr valign="top">
                <td>GPT-4</td>
                <td>2207 (453)</td>
                <td>18 (4)</td>
                <td>333 (66)</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Automated Comparison of Similarities in LLM Responses</title>
        <p>Automatic metrics were used to compare the similarity of the responses generated by the 5 LLMs (<xref rid="figure4" ref-type="fig">Figure 4</xref>), namely, BLEU, SacreBLEU, METEOR, ROUGE, and BERTScore. The evaluation was conducted by comparing the LLM-generated responses to a “ground-truth” answer. In <xref rid="figure4" ref-type="fig">Figure 4</xref>, column 1 provides the ground-truth answer, and column 2 provides the equivalent generated answers from the LLMs. We also included the human answers from Yahoo! Answers for this evaluation. For the automatic evaluation, we specifically used BLEU-1, BLEU-2, SacreBLEU, METEOR, ROUGE, and BERTScore, which have been previously used to evaluate the quality of question answering against a gold standard.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Evaluation results of the responses of the large language models using automatic metrics. BERTScore: Bidirectional Encoder Representations from Transformers Score; BLEU: Bilingual Evaluation Understudy; METEOR: Metric for Evaluation of Translation With Explicit Ordering; ROUGE: Recall-Oriented Understudy for Gisting Evaluation.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e56655_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>All the metrics ranged from 0.0 to 1.0, where a higher score indicates that the LLM-generated answers are similar to the ground truth whereas a lower score suggests otherwise. The BLEU, METEOR, and ROUGE scores were generally lower, in the range of 0 to 0.37, whereas BERTScore values were generally higher, in the range of 0.46 to 0.63. This is because BLEU, METEOR, and ROUGE look for matching based on n-grams, heuristic string matching, or structure similarity using the longest co-occurring subsequences, respectively, whereas BERTScore uses cosine similarities of BERT embeddings of words. When GPT-4 was the reference answer, the response from GPT-3.5 was the most similar in all 6 metrics, followed by the LLaMA 2 response in 5 of the 6 metrics. Similarly, when GPT-3.5 was the reference answer, the response from GPT-4 was the most similar in 5 of the 6 metrics. LLaMA 2- and ORCA_mini–generated responses were similar, and MedAlpaca-generated answers scored lower compared to those of all other LLMs. Human answers from Yahoo data scored the lowest and were, thus, the least similar to the LLM-generated answers.</p>
        <p><xref ref-type="table" rid="table5">Table 5</xref> shows the win rates judged by GPT-4 against Yahoo users’ answers in different aspects. Overall, GPT-4 achieved the highest performance, with win rates of nearly 100% against the human responses. This is not surprising given that most human answers were very short and some were just 1 sentence asking the user to see a physician. GPT-4 and GPT-3.5 were followed by LLaMA 2 and ORCA_mini with 70% to 80% win rates. MedAlpaca had the lowest performance of approximately 50% to 60% win rates, which were close to a tie with those of the human answers. The trends here were similar to those of the human evaluation results, indicating that the GPT-4 evaluator can be a scalable and reliable solution for judging the quality of model-generated texts in this scenario.</p>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Win rate evaluation results.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="400"/>
            <col width="150"/>
            <col width="150"/>
            <col width="150"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td>Win rate against human answers (evaluated by GPT-4)</td>
                <td>Relevance</td>
                <td>Correctness</td>
                <td>Helpfulness</td>
                <td>Less harm</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>MedAlpaca</td>
                <td>50.9</td>
                <td>54.9</td>
                <td>54.9</td>
                <td>54.9</td>
              </tr>
              <tr valign="top">
                <td>ORCA_mini</td>
                <td>78.4</td>
                <td>74.5</td>
                <td>84.3</td>
                <td>84.3</td>
              </tr>
              <tr valign="top">
                <td>LLaMA 2</td>
                <td>82.3</td>
                <td>80.3</td>
                <td>86.2</td>
                <td>70.5</td>
              </tr>
              <tr valign="top">
                <td>GPT-3.5</td>
                <td>98.0</td>
                <td>100.0</td>
                <td>98.0</td>
                <td>96.0</td>
              </tr>
              <tr valign="top">
                <td>GPT-4</td>
                <td>96.0</td>
                <td>98.0</td>
                <td>98.0</td>
                <td>98.0</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Manual Evaluation With Medical Experts</title>
        <p><xref rid="figure5" ref-type="fig">Figure 5</xref> illustrates the manual evaluation results of the LLM responses and human responses by 5 medical experts. Note that a lower value means a higher score. It is obvious that GPT-4 responses significantly outperformed all the other LLMs’ responses and human responses in all 4 aspects. <xref ref-type="boxed-text" rid="box2">Textbox 2</xref> shows experts’ feedback on the LLM and human responses. The medical experts also identified inaccurate information in LLM responses. A few observations from the medical experts are listed in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Manual evaluation of the large language model (LLM) and human responses. Lower scores denote better capabilities.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e56655_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <boxed-text id="box2" position="float">
          <title>Human experts’ feedback on the large language model and human responses.</title>
          <p>
            <bold>Large language model or human answer and expert feedback</bold>
          </p>
          <list list-type="bullet">
            <list-item>
              <p>LLaMA 2: “It is a great answer. He was able to explain in details the results. He provides inside on the different differential diagnosis. And provide alternative a management. He shows empathy.”</p>
            </list-item>
            <list-item>
              <p>LLaMA 2: “Very thorough and thoughtful.”</p>
            </list-item>
            <list-item>
              <p>ORCA_mini: “It was a great answer. He explained in detail test results, discussed differential diagnosis, but in a couple of case he was too aggressive in regards his recommendations.”</p>
            </list-item>
            <list-item>
              <p>ORCA_mini: “Standard answers, not the most in depth.”</p>
            </list-item>
            <list-item>
              <p>GPT-4: “It was honest the fact he introduced himself as he was not a physician. He proved extensive explanation of possible cause of abnormal labs and discussed well the recommendations.”</p>
            </list-item>
            <list-item>
              <p>GPT-4: “Too wordy at times, gets irrelevant.”</p>
            </list-item>
            <list-item>
              <p>GPT-3.5: “Strong responses in general.”</p>
            </list-item>
            <list-item>
              <p>GPT-3.5: “Clear and some way informative and helpful to pts.”</p>
            </list-item>
            <list-item>
              <p>GPT-3.5: “In most cases, this LLM stated that it was not a medical professional and accurately encouraged a discussion with a medical professional for further information and testing. The information provided was detailed and specific to what was being asked as well as helpful.”</p>
            </list-item>
            <list-item>
              <p>MedAlpaca: “This statement seems so sure that he felt superficial. It made me feel he did not provide enough information. It felt not safe for the patient.”</p>
            </list-item>
            <list-item>
              <p>MedAlpaca: “Short and succinct. condescending at times.”</p>
            </list-item>
            <list-item>
              <p>Human answer: “These were not very helpful or accurate. Most did not state their credentials to know how credible they are. Some of the, if not most, of language learning models gave better answers, though some of the language learning models also claimed to be medical professionals—which isn’t accurate statement either.”</p>
            </list-item>
            <list-item>
              <p>Human answer: “Usually focused on one aspect of the scenario, not helpful in comprehensive care. focused on isolated lab value, with minimal evidence—these can be harmful responses for patients.”</p>
            </list-item>
            <list-item>
              <p>Human answer: “These are really bad answers.”</p>
            </list-item>
            <list-item>
              <p>Human answer: “Some of the answer were helpful, other not much, and other offering options that might not need to be indicated.”</p>
            </list-item>
          </list>
        </boxed-text>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>This study evaluated the feasibility of using generative LLMs to answer patients’ laboratory test result questions using 53 patients’ questions on a social Q&amp;A website, Yahoo! Answers. On the basis of the results of our study, GPT-4 outperformed other similar LLMs (ie, GPT-3.5, LLaMA 2, ORCA_mini, and MedAlpaca) according to both automated metrics and manual evaluation. In particular, GPT-4 always provided disclaimers, possibly to avoid legal issues. However, GPT-4 responses may also suffer from lack of interpretation of one’s medical context, incorrect statements, and lack of references.</p>
        <p>Recent studies [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>] regarding the use of LLMs to answer laboratory test result questions from medical professionals found that ChatGPT may give superficial or incorrect answers to laboratory test result–related questions and can only provide accurate answers to approximately 50% of questions [<xref ref-type="bibr" rid="ref26">26</xref>]. They also found that ChatGPT’s responses were not sufficiently tailored to the case or clinical questions to be useful for clinical consultation. For instance, diagnoses of liver injury were made solely based on γ-glutamyl transferase levels without considering other liver enzyme indicators. In addition, high levels of glucose and glycated hemoglobin (HbA<sub>1c</sub>) were both identified as indicative of diabetes regardless of whether HbA<sub>1c</sub> levels were normal or elevated. These studies also highlighted that GPT-4 failed to account for preanalytical factors such as fasting status for glucose tests and struggled to differentiate between abnormal and critically abnormal laboratory test values. Our study observed similar patterns, where a normal HbA<sub>1c</sub> level coupled with high glucose levels led to a diabetes prediction and critically low iron levels were merely classified as abnormal.</p>
        <p>In addition, our findings also show that GPT-4 accurately distinguished between normal, prediabetic, and diabetic HbA<sub>1c</sub> ranges considering fasting glucose levels and preanalytical conditions such as fasting status. Furthermore, in cases of elevated bilirubin levels, GPT-4 correctly associated them with potential jaundice citing the patient’s yellow eye discoloration and appropriately considered a comprehensive set of laboratory test results—including elevated liver enzymes and bilirubin levels—and significant alcohol intake history to recommend diagnoses such as alcoholic liver disease, hepatitis, bile duct obstruction, and liver cancer.</p>
        <p>On the basis of our observation with the limited number of questions, we found that patients’ questions are often less complex than professionals’ questions, making ChatGPT more likely to provide an adequately accurate answer to such questions. In our manual evaluation of 7 selected patients’ laboratory test result questions, 91% (32/35) of the ratings from 5 medical experts on GPT-4’s response accuracy were either 1 (<italic>very high</italic>) or 2 (<italic>high</italic>).</p>
        <p>Through this study, we gained insights into the challenges of using generative LLMs to answer patients’ laboratory test result–related questions and provide suggestions to mitigate these challenges. First, when asking laboratory test result questions on social Q&amp;A websites, patients tend to focus on laboratory test results but may not provide pertinent information needed for result interpretation. In the real-world clinical setting, to fully evaluate the results, clinicians may need to evaluate the medical history of a patient and examine the trends of the laboratory test results over time. This shows that, to allow LLMs to provide a more thorough evaluation of laboratory test results, the question prompts may need to be augmented with additional information. As such, LLMs could be useful in prompting patients to provide additional information. A possible question prompt would be the following: “What additional information or data would you need to provide a more accurate diagnosis for me?”</p>
        <p>Second, we found that it is important to understand the limitations of LLMs when answering laboratory test–related questions. As general-purpose generative AI models, they should be used to explain common terminologies and test purposes; clarify the typical reference ranges for common laboratory tests and what it might mean to have values outside these ranges; and offer general interpretation of laboratory test results, such as what it might mean to have high or low levels in certain common laboratory tests. On the basis of our findings, LLMs, especially GPT-4, can provide a basic interpretation of laboratory test results without reference ranges in the question prompts. LLMs could also be used to suggest what questions to ask health care providers. They should not be used for diagnostic purposes or treatment advice. All laboratory test results should be interpreted by a health care professional who can consider the full context of one’s health. For providers, LLMs could also be used as an educational tool for laboratory professionals, providing real-time information and explanations of laboratory techniques. When using LLMs for laboratory test result interpretation, it is important to consider the ethical and practical implications, including data privacy, the need for human oversight, and the potential for AI to both enhance and disrupt clinical workflows.</p>
        <p>Third, we found it challenging to evaluate laboratory test result questions using Q&amp;A pairs from social Q&amp;A websites such as Yahoo! Answers. This is mainly because the answers provided by web-based users (who may not be medical professionals) were generally short, often focused on one aspect of the question or isolated laboratory tests, possibly opinionated, and possibly inaccurate with minimal evidence. Therefore, it is unlikely that human answers from social Q&amp;A websites can be used as a gold standard to evaluate LLM answers. We found that GPT-4 can provide comprehensive, thoughtful, sympathetic, and fairly accurate interpretation of individual laboratory tests, but it still suffers from a number of problems: (1) LLM answers are not individualized, (2) it is not clear what sources LLMs use to generate the answers, (3) LLMs do not ask clarifying questions if the provided prompts do not contain important information for LLMs to generate responses, and (4) validation by medical experts is needed to reduce hallucination and fill in missing information to ensure the quality of the responses.</p>
      </sec>
      <sec>
        <title>Future Directions</title>
        <p>We would like to point out a few ways to improve the quality of LLM responses to laboratory test–related questions. First, the interpretation of certain laboratory tests is dependent on age group, gender, and possibly other conditions pertaining to particular population subgroups (eg, pregnant women), but LLMs do not ask clarifying questions, so it is important to enrich the question prompts with necessary information available in electronic health records or ask patients to provide necessary information for more accurate interpretation. Second, it is also important to have medical professionals review and edit the LLM responses. For example, we found that LLaMA 2 self-identified as a “health expert,” which is obviously problematic if such responses were directly sent to patients. Therefore, it is important to postprocess the responses to highlight sentences that are risky. Third, LLMs are sensitive to question prompts. We could study different prompt engineering and structuring strategies (eg, role prompting and chain of thought) and evaluate whether these prompting approaches would improve the quality of the answers. Fourth, one could also collect clinical guidelines that provide credible laboratory result interpretation to further train LLMs to improve answer quality. We could then leverage the retrieval-augmented generation approach to allow LLMs to generate responses from a limited set of credible information sources [<xref ref-type="bibr" rid="ref52">52</xref>]. Fifth, we could evaluate the confidence level of the sentences in the responses. Sixth, a gold-standard benchmark Q&amp;A data set for laboratory result interpretation could be developed to allow the community to advance with different augmentation approaches.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>A few limitations should be noted in this study. First, the ChatGPT web version is nondeterministic in that the same prompt may generate different responses when used by different users. Second, the sample size for the human evaluation was small. Nonetheless, this study produced evidence that LLMs such as GPT-4 can be a promising tool for filling the information gap for understanding laboratory tests and various approaches can be used to enhance the quality of the responses.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In this study, we evaluated the feasibility of using generative LLMs to answer common laboratory test result interpretation questions from patients. We generated responses from 5 LLMs—ChatGPT (GPT-4 version and GPT-3.5 version), LLaMA 2, MedAlpaca, and ORCA_mini—for laboratory test questions selected from Yahoo! Answers and evaluated these responses using both automated metrics and manual evaluation. We found that GPT-4 performed better compared to the other LLMs in generating more accurate, helpful, relevant, and safe answers to these questions. We also identified a number of ways to improve the quality of LLM responses from both the prompt and response sides.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>The responses generated by the 5 large language models and the human answers from Yahoo users.</p>
        <media xlink:href="jmir_v26i1e56655_app1.xlsx" xlink:title="XLSX File  (Microsoft Excel File), 169 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Distribution of the lengths of the responses.</p>
        <media xlink:href="jmir_v26i1e56655_app2.pdf" xlink:title="PDF File  (Adobe PDF File), 782 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>A few observations from the medical experts regarding the accuracy of the large language model responses.</p>
        <media xlink:href="jmir_v26i1e56655_app3.pdf" xlink:title="PDF File  (Adobe PDF File), 96 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">autoML</term>
          <def>
            <p>automated machine learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">BERTScore</term>
          <def>
            <p>Bidirectional Encoder Representations from Transformers Score</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">BioBERT</term>
          <def>
            <p>biomedical Bidirectional Encoder Representations from Transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">BLEU</term>
          <def>
            <p>Bilingual Evaluation Understudy</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">ClinicalBERT</term>
          <def>
            <p>clinical Bidirectional Encoder Representations from Transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">HbA<sub>1c</sub></term>
          <def>
            <p>glycated hemoglobin</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">ICC</term>
          <def>
            <p>intraclass correlation coefficient</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">METEOR</term>
          <def>
            <p>Metric for Evaluation of Translation With Explicit Ordering</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">PubMedBERT</term>
          <def>
            <p>PubMed-trained Bidirectional Encoder Representations from Transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">Q&amp;A</term>
          <def>
            <p>question-and-answer</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">ROUGE</term>
          <def>
            <p>Recall-Oriented Understudy for Gisting Evaluation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">SciBERT</term>
          <def>
            <p>scientific Bidirectional Encoder Representations from Transformers</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This project was partially supported by the University of Florida Clinical and Translational Science Institute, which is supported in part by the National Institutes of Health (NIH) National Center for Advancing Translational Sciences under award UL1TR001427, as well as the Agency for Healthcare Research and Quality (AHRQ) under award R21HS029969. This study was supported by the NIH Intramural Research Program, National Library of Medicine (QJ and ZL). The content is solely the responsibility of the authors and does not necessarily represent the official views of the NIH and AHRQ. The authors would like to thank Angelique Deville, Caroline Bennett, Hailey Thompson, and Maggie Awad for labeling the questions for the question classification model.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The data sets generated during and analyzed during this study are available from the corresponding author on reasonable request.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="conflict">
        <p>QJ is a coauthor and an active associate editor for the <italic>Journal of Medical Internet Research</italic>. All other authors declare no other conflicts of interest.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
          <article-title>Healthy people 2030: building a healthier future for all</article-title>
          <source>Office of Disease Prevention and Health Promotion</source>
          <access-date>2023-05-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://health.gov/healthypeople">https://health.gov/healthypeople</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="web">
          <article-title>NHE fact sheet</article-title>
          <source>Centers for Medicare &amp; Medicaid Services</source>
          <access-date>2023-06-06</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cms.gov/research-statistics-data-and-systems/statistics-trends-and-reports/nationalhealthexpenddata/nhe-fact-sheet">https://tinyurl.com/yc4durw4</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bauer</surname>
              <given-names>UE</given-names>
            </name>
            <name name-style="western">
              <surname>Briss</surname>
              <given-names>PA</given-names>
            </name>
            <name name-style="western">
              <surname>Goodman</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Bowman</surname>
              <given-names>BA</given-names>
            </name>
          </person-group>
          <article-title>Prevention of chronic disease in the 21st century: elimination of the leading preventable causes of premature death and disability in the USA</article-title>
          <source>Lancet</source>
          <year>2014</year>
          <month>07</month>
          <volume>384</volume>
          <issue>9937</issue>
          <fpage>45</fpage>
          <lpage>52</lpage>
          <pub-id pub-id-type="doi">10.1016/s0140-6736(14)60648-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>Centers for Medicare and Medicaid Services (CMS)</collab>
            <collab>Centers for Disease Control and Prevention (CDC)</collab>
            <collab>Office for Civil Rights (OCR)</collab>
          </person-group>
          <article-title>CLIA program and HIPAA privacy rule; patients' access to test reports. Final rule</article-title>
          <source>Fed Regist</source>
          <year>2014</year>
          <month>02</month>
          <day>06</day>
          <volume>79</volume>
          <issue>25</issue>
          <fpage>7289</fpage>
          <lpage>316</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.gpo.gov/fdsys/pkg/FR-2014-02-06/pdf/2014-02280.pdf"/>
          </comment>
          <pub-id pub-id-type="medline">24605389</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pillemer</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Price</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Paone</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Martich</surname>
              <given-names>GD</given-names>
            </name>
            <name name-style="western">
              <surname>Albert</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Haidari</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Updike</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Rudin</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Mehrotra</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Direct release of test results to patients increases patient engagement and utilization of care</article-title>
          <source>PLoS One</source>
          <year>2016</year>
          <volume>11</volume>
          <issue>6</issue>
          <fpage>e0154743</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0154743"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0154743</pub-id>
          <pub-id pub-id-type="medline">27337092</pub-id>
          <pub-id pub-id-type="pii">PONE-D-14-55360</pub-id>
          <pub-id pub-id-type="pmcid">PMC4919031</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="web">
          <article-title>Health IT legislation: 21st century cures act</article-title>
          <source>Office of the National Coordinator for Health Information Technology</source>
          <access-date>2023-02-19</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.healthit.gov/topic/laws-regulation-and-policy/health-it-legislation">https://www.healthit.gov/topic/laws-regulation-and-policy/health-it-legislation</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tsai</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bell</surname>
              <given-names>EJ</given-names>
            </name>
            <name name-style="western">
              <surname>Woo</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Baldwin</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Pfeffer</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>How patients use a patient portal: an institutional case study of demographics and usage patterns</article-title>
          <source>Appl Clin Inform</source>
          <year>2019</year>
          <month>01</month>
          <day>06</day>
          <volume>10</volume>
          <issue>1</issue>
          <fpage>96</fpage>
          <lpage>102</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.thieme-connect.com/DOI/DOI?10.1055/s-0038-1677528"/>
          </comment>
          <pub-id pub-id-type="doi">10.1055/s-0038-1677528</pub-id>
          <pub-id pub-id-type="medline">30727003</pub-id>
          <pub-id pub-id-type="pmcid">PMC6365289</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Witteman</surname>
              <given-names>HO</given-names>
            </name>
            <name name-style="western">
              <surname>Zikmund-Fisher</surname>
              <given-names>BJ</given-names>
            </name>
          </person-group>
          <article-title>Communicating laboratory results to patients and families</article-title>
          <source>Clin Chem Lab Med</source>
          <year>2019</year>
          <month>02</month>
          <day>25</day>
          <volume>57</volume>
          <issue>3</issue>
          <fpage>359</fpage>
          <lpage>64</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.degruyter.com/document/doi/10.1515/cclm-2018-0634"/>
          </comment>
          <pub-id pub-id-type="doi">10.1515/cclm-2018-0634</pub-id>
          <pub-id pub-id-type="medline">30407910</pub-id>
          <pub-id pub-id-type="pii">cclm-2018-0634</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Turchioe</surname>
              <given-names>MR</given-names>
            </name>
            <name name-style="western">
              <surname>Myers</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Isaac</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Baik</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Grossman</surname>
              <given-names>LV</given-names>
            </name>
            <name name-style="western">
              <surname>Ancker</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Creber</surname>
              <given-names>RM</given-names>
            </name>
          </person-group>
          <article-title>A systematic review of patient-facing visualizations of personal health data</article-title>
          <source>Appl Clin Inform</source>
          <year>2019</year>
          <month>08</month>
          <day>09</day>
          <volume>10</volume>
          <issue>4</issue>
          <fpage>751</fpage>
          <lpage>70</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31597182"/>
          </comment>
          <pub-id pub-id-type="doi">10.1055/s-0039-1697592</pub-id>
          <pub-id pub-id-type="medline">31597182</pub-id>
          <pub-id pub-id-type="pmcid">PMC6785326</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alpert</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Krist</surname>
              <given-names>AH</given-names>
            </name>
            <name name-style="western">
              <surname>Aycock</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Kreps</surname>
              <given-names>GL</given-names>
            </name>
          </person-group>
          <article-title>Applying multiple methods to comprehensively evaluate a patient portal's effectiveness to convey information to patients</article-title>
          <source>J Med Internet Res</source>
          <year>2016</year>
          <month>05</month>
          <day>17</day>
          <volume>18</volume>
          <issue>5</issue>
          <fpage>e112</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2016/5/e112/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/jmir.5451</pub-id>
          <pub-id pub-id-type="medline">27188953</pub-id>
          <pub-id pub-id-type="pii">v18i5e112</pub-id>
          <pub-id pub-id-type="pmcid">PMC4887660</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zikmund-Fisher</surname>
              <given-names>BJ</given-names>
            </name>
            <name name-style="western">
              <surname>Exe</surname>
              <given-names>NL</given-names>
            </name>
            <name name-style="western">
              <surname>Witteman</surname>
              <given-names>HO</given-names>
            </name>
          </person-group>
          <article-title>Numeracy and literacy independently predict patients' ability to identify out-of-range test results</article-title>
          <source>J Med Internet Res</source>
          <year>2014</year>
          <month>08</month>
          <day>08</day>
          <volume>16</volume>
          <issue>8</issue>
          <fpage>e187</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2014/8/e187/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/jmir.3241</pub-id>
          <pub-id pub-id-type="medline">25135688</pub-id>
          <pub-id pub-id-type="pii">v16i8e187</pub-id>
          <pub-id pub-id-type="pmcid">PMC4137189</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Citardi</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Xing</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Patient challenges and needs in comprehending laboratory test results: mixed methods study</article-title>
          <source>J Med Internet Res</source>
          <year>2020</year>
          <month>12</month>
          <day>07</day>
          <volume>22</volume>
          <issue>12</issue>
          <fpage>e18725</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2020/12/e18725/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/18725</pub-id>
          <pub-id pub-id-type="medline">33284117</pub-id>
          <pub-id pub-id-type="pii">v22i12e18725</pub-id>
          <pub-id pub-id-type="pmcid">PMC7752528</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fraccaro</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Vigo</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Balatsoukas</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>van der Veer</surname>
              <given-names>SN</given-names>
            </name>
            <name name-style="western">
              <surname>Hassan</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wood</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Sinha</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Buchan</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Peek</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Presentation of laboratory test results in patient portals: influence of interface design on risk interpretation and visual search behaviour</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2018</year>
          <month>02</month>
          <day>12</day>
          <volume>18</volume>
          <issue>1</issue>
          <fpage>11</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-018-0589-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-018-0589-7</pub-id>
          <pub-id pub-id-type="medline">29433495</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-018-0589-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC5809992</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bar-Lev</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Beimel</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Numbers, graphs and words - do we really understand the lab test results accessible via the patient portals?</article-title>
          <source>Isr J Health Policy Res</source>
          <year>2020</year>
          <month>10</month>
          <day>28</day>
          <volume>9</volume>
          <issue>1</issue>
          <fpage>58</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ijhpr.biomedcentral.com/articles/10.1186/s13584-020-00415-z"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13584-020-00415-z</pub-id>
          <pub-id pub-id-type="medline">33115536</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13584-020-00415-z</pub-id>
          <pub-id pub-id-type="pmcid">PMC7592036</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Giardina</surname>
              <given-names>TD</given-names>
            </name>
            <name name-style="western">
              <surname>Baldwin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Nystrom</surname>
              <given-names>DT</given-names>
            </name>
            <name name-style="western">
              <surname>Sittig</surname>
              <given-names>DF</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Patient perceptions of receiving test results via online portals: a mixed-methods study</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2018</year>
          <month>04</month>
          <day>01</day>
          <volume>25</volume>
          <issue>4</issue>
          <fpage>440</fpage>
          <lpage>6</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/29240899"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocx140</pub-id>
          <pub-id pub-id-type="medline">29240899</pub-id>
          <pub-id pub-id-type="pii">4689172</pub-id>
          <pub-id pub-id-type="pmcid">PMC5885801</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Doi</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Langsted</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Nordestgaard</surname>
              <given-names>BG</given-names>
            </name>
          </person-group>
          <article-title>Elevated remnant cholesterol reclassifies risk of ischemic heart disease and myocardial infarction</article-title>
          <source>J Am Coll Cardiol</source>
          <year>2022</year>
          <month>06</month>
          <day>21</day>
          <volume>79</volume>
          <issue>24</issue>
          <fpage>2383</fpage>
          <lpage>97</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S0735-1097(22)04818-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jacc.2022.03.384</pub-id>
          <pub-id pub-id-type="medline">35710189</pub-id>
          <pub-id pub-id-type="pii">S0735-1097(22)04818-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC8972554</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wadström</surname>
              <given-names>BN</given-names>
            </name>
            <name name-style="western">
              <surname>Wulff</surname>
              <given-names>AB</given-names>
            </name>
            <name name-style="western">
              <surname>Pedersen</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>Jensen</surname>
              <given-names>GB</given-names>
            </name>
            <name name-style="western">
              <surname>Nordestgaard</surname>
              <given-names>BG</given-names>
            </name>
          </person-group>
          <article-title>Elevated remnant cholesterol increases the risk of peripheral artery disease, myocardial infarction, and ischaemic stroke: a cohort-based study</article-title>
          <source>Eur Heart J</source>
          <year>2022</year>
          <month>09</month>
          <day>07</day>
          <volume>43</volume>
          <issue>34</issue>
          <fpage>3258</fpage>
          <lpage>69</lpage>
          <pub-id pub-id-type="doi">10.1093/eurheartj/ehab705</pub-id>
          <pub-id pub-id-type="medline">34661640</pub-id>
          <pub-id pub-id-type="pii">6399975</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chu</surname>
              <given-names>SK</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>WN</given-names>
            </name>
            <name name-style="western">
              <surname>van Ginneken</surname>
              <given-names>WF</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>Hung</surname>
              <given-names>MY</given-names>
            </name>
          </person-group>
          <article-title>Quality and clarity of health information on Q and A sites</article-title>
          <source>Libr Inf Sci Res</source>
          <year>2018</year>
          <month>07</month>
          <volume>40</volume>
          <issue>3-4</issue>
          <fpage>237</fpage>
          <lpage>44</lpage>
          <pub-id pub-id-type="doi">10.1016/j.lisr.2018.09.005</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Oh</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yi</surname>
              <given-names>YJ</given-names>
            </name>
            <name name-style="western">
              <surname>Worrall</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Quality of health answers in social Q and A</article-title>
          <source>Proc Assoc Inf Sci Technol</source>
          <year>2013</year>
          <month>01</month>
          <day>24</day>
          <volume>49</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>6</lpage>
          <pub-id pub-id-type="doi">10.1002/meet.14504901075</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tao</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Yuan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Qu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Presentation of personal health information for consumers: an experimental comparison of four visualization formats</article-title>
          <source>Proceedings of the 15th International Conference on Engineering Psychology and Cognitive Ergonomics</source>
          <year>2018</year>
          <conf-name>EPCE '18</conf-name>
          <conf-date>July 15-20, 2018</conf-date>
          <conf-loc>Las Vegas, NV</conf-loc>
          <fpage>490</fpage>
          <lpage>500</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/chapter/10.1007/978-3-319-91122-9_40"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Struikman</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Bol</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Goedhart</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>van Weert</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Talboom-Kamp</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>van Delft</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Brabers</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>van Dijk</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Features of a patient portal for blood test results and patient health engagement: web-based pre-post experiment</article-title>
          <source>J Med Internet Res</source>
          <year>2020</year>
          <month>07</month>
          <day>20</day>
          <volume>22</volume>
          <issue>7</issue>
          <fpage>e15798</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2020/7/e15798/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/15798</pub-id>
          <pub-id pub-id-type="medline">32706704</pub-id>
          <pub-id pub-id-type="pii">v22i7e15798</pub-id>
          <pub-id pub-id-type="pmcid">PMC7399951</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kopanitsa</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Study of patients' attitude to automatic interpretation of laboratory test results and its influence on follow-up rate</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2022</year>
          <month>03</month>
          <day>27</day>
          <volume>22</volume>
          <issue>1</issue>
          <fpage>79</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-022-01805-w"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-022-01805-w</pub-id>
          <pub-id pub-id-type="medline">35346173</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-022-01805-w</pub-id>
          <pub-id pub-id-type="pmcid">PMC8962526</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zikmund-Fisher</surname>
              <given-names>BJ</given-names>
            </name>
            <name name-style="western">
              <surname>Scherer</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Witteman</surname>
              <given-names>HO</given-names>
            </name>
            <name name-style="western">
              <surname>Solomon</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Exe</surname>
              <given-names>NL</given-names>
            </name>
            <name name-style="western">
              <surname>Fagerlin</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Effect of harm anchors in visual displays of test results on patient perceptions of urgency about near-normal values: experimental study</article-title>
          <source>J Med Internet Res</source>
          <year>2018</year>
          <month>03</month>
          <day>26</day>
          <volume>20</volume>
          <issue>3</issue>
          <fpage>e98</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2018/3/e98/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/jmir.8889</pub-id>
          <pub-id pub-id-type="medline">29581088</pub-id>
          <pub-id pub-id-type="pii">v20i3e98</pub-id>
          <pub-id pub-id-type="pmcid">PMC5891666</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Morrow</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Azevedo</surname>
              <given-names>RF</given-names>
            </name>
            <name name-style="western">
              <surname>Garcia-Retamero</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Hasegawa-Johnson</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Schuh</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Contextualizing numeric clinical test results for gist comprehension: implications for EHR patient portals</article-title>
          <source>J Exp Psychol Appl</source>
          <year>2019</year>
          <month>03</month>
          <volume>25</volume>
          <issue>1</issue>
          <fpage>41</fpage>
          <lpage>61</lpage>
          <pub-id pub-id-type="doi">10.1037/xap0000203</pub-id>
          <pub-id pub-id-type="medline">30688498</pub-id>
          <pub-id pub-id-type="pii">2019-04353-001</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tian</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Yeganova</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>PT</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Comeau</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Islamaj</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kapoor</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Opportunities and challenges for ChatGPT and large language models in biomedicine and health</article-title>
          <source>Brief Bioinform</source>
          <year>2023</year>
          <month>11</month>
          <day>22</day>
          <volume>25</volume>
          <issue>1</issue>
          <fpage>bbad493</fpage>
          <pub-id pub-id-type="doi">10.1093/bib/bbad493</pub-id>
          <pub-id pub-id-type="medline">38168838</pub-id>
          <pub-id pub-id-type="pii">7505071</pub-id>
          <pub-id pub-id-type="pmcid">PMC10762511</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cadamuro</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cabitza</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Debeljak</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>De Bruyne</surname>
              <given-names>SD</given-names>
            </name>
            <name name-style="western">
              <surname>Frans</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Perez</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Ozdemir</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Tolios</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Carobene</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Padoan</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Potentials and pitfalls of ChatGPT and natural-language artificial intelligence models for the understanding of laboratory medicine test results. An assessment by the European Federation of Clinical Chemistry and Laboratory Medicine (EFLM) working group on artificial intelligence (WG-AI)</article-title>
          <source>Clin Chem Lab Med</source>
          <year>2023</year>
          <month>06</month>
          <day>27</day>
          <volume>61</volume>
          <issue>7</issue>
          <fpage>1158</fpage>
          <lpage>66</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.degruyter.com/document/doi/10.1515/cclm-2023-0355"/>
          </comment>
          <pub-id pub-id-type="doi">10.1515/cclm-2023-0355</pub-id>
          <pub-id pub-id-type="medline">37083166</pub-id>
          <pub-id pub-id-type="pii">cclm-2023-0355</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Muñoz-Zuluaga</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Greenblatt</surname>
              <given-names>MB</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>HS</given-names>
            </name>
          </person-group>
          <article-title>Assessing the accuracy and clinical utility of ChatGPT in laboratory medicine</article-title>
          <source>Clin Chem</source>
          <year>2023</year>
          <month>08</month>
          <day>02</day>
          <volume>69</volume>
          <issue>8</issue>
          <fpage>939</fpage>
          <lpage>40</lpage>
          <pub-id pub-id-type="doi">10.1093/clinchem/hvad058</pub-id>
          <pub-id pub-id-type="medline">37231970</pub-id>
          <pub-id pub-id-type="pii">7180070</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kou</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>DT</given-names>
            </name>
            <name name-style="western">
              <surname>Huh-Yoo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Understanding patient information needs about their clinical laboratory results: a study of social Q and A site</article-title>
          <source>Stud Health Technol Inform</source>
          <year>2019</year>
          <month>08</month>
          <day>21</day>
          <volume>264</volume>
          <fpage>1403</fpage>
          <lpage>7</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31438157"/>
          </comment>
          <pub-id pub-id-type="doi">10.3233/SHTI190458</pub-id>
          <pub-id pub-id-type="medline">31438157</pub-id>
          <pub-id pub-id-type="pii">SHTI190458</pub-id>
          <pub-id pub-id-type="pmcid">PMC6857529</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wilson</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Making sense of clinical laboratory results: an analysis of questions and replies in a social Q and A community</article-title>
          <source>Stud Health Technol Inform</source>
          <year>2019</year>
          <month>08</month>
          <day>21</day>
          <volume>264</volume>
          <fpage>2009</fpage>
          <lpage>10</lpage>
          <pub-id pub-id-type="doi">10.3233/SHTI190759</pub-id>
          <pub-id pub-id-type="medline">31438453</pub-id>
          <pub-id pub-id-type="pii">SHTI190759</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kurstjens</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schipper</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Krabbe</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kusters</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Predicting hemoglobinopathies using ChatGPT</article-title>
          <source>Clin Chem Lab Med</source>
          <year>2024</year>
          <month>02</month>
          <day>26</day>
          <volume>62</volume>
          <issue>3</issue>
          <fpage>e59</fpage>
          <lpage>61</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.degruyter.com/document/doi/10.1515/cclm-2023-0885/html?lang=en"/>
          </comment>
          <pub-id pub-id-type="doi">10.1515/cclm-2023-0885</pub-id>
          <pub-id pub-id-type="medline">37650428</pub-id>
          <pub-id pub-id-type="pii">cclm-2023-0885</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>He</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Tian</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Erdengasileng</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hanna</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gong</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Lustria</surname>
              <given-names>ML</given-names>
            </name>
          </person-group>
          <article-title>Annotation and information extraction of consumer-friendly health articles for enhancing laboratory test reporting</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2023</year>
          <volume>2023</volume>
          <fpage>407</fpage>
          <lpage>16</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/38222337"/>
          </comment>
          <pub-id pub-id-type="medline">38222337</pub-id>
          <pub-id pub-id-type="pii">492</pub-id>
          <pub-id pub-id-type="pmcid">PMC10785897</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>So</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title>
          <source>Bioinformatics</source>
          <year>2020</year>
          <month>02</month>
          <day>15</day>
          <volume>36</volume>
          <issue>4</issue>
          <fpage>1234</fpage>
          <lpage>40</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31501885"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id>
          <pub-id pub-id-type="medline">31501885</pub-id>
          <pub-id pub-id-type="pii">5566506</pub-id>
          <pub-id pub-id-type="pmcid">PMC7703786</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alsentzer</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Boag</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>WH</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Naumann</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>McDermott</surname>
              <given-names>MB</given-names>
            </name>
          </person-group>
          <article-title>Publicly available clinical BERT embeddings</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on April 6, 2019</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1904.03323"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/w19-1909</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Beltagy</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Lo</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Cohan</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>SciBERT: a pretrained language model for scientific text</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on March 26, 2019</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1903.10676"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tinn</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Lucas</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Usuyama</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Naumann</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Poon</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Domain-specific language model pretraining for biomedical natural language processing</article-title>
          <source>ACM Trans Comput Healthc</source>
          <year>2021</year>
          <month>10</month>
          <day>15</day>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>23</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/10.1145/3458754"/>
          </comment>
          <pub-id pub-id-type="doi">10.1145/3458754</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>OpenAI</collab>
          </person-group>
          <article-title>GPT-4 technical report</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on March 15, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2303.08774"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Zu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Shao</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Gong</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gui</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>A comprehensive capability analysis of GPT-3 and GPT-3.5 series models</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on March 18, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2303.10420"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Touvron</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Martin</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Stone</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Albert</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Almahairi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Babaei</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Bashlykov</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Batra</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Bhargava</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bhosale</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Bikel</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Blecher</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Ferrer</surname>
              <given-names>CC</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cucurull</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Narang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rodriguez</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Stojnic</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Edunov</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Scialom</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Llama 2: open foundation and fine-tuned chat models</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on July 18, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2307.09288"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Han</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Adams</surname>
              <given-names>LC</given-names>
            </name>
            <name name-style="western">
              <surname>Papaioannou</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Grundmann</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Oberhauser</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Löser</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Truhn</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bressem</surname>
              <given-names>KK</given-names>
            </name>
          </person-group>
          <article-title>MedAlpaca -- an open-source collection of medical conversational AI models and training data</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on April 14, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2304.08247"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="web">
          <article-title>orca_mini_3b</article-title>
          <source>Hugging Face</source>
          <access-date>2023-12-04</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://huggingface.co/pankajmathur/orca_mini_3b">https://huggingface.co/pankajmathur/orca_mini_3b</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="web">
          <article-title>LangChain: introduction and getting started</article-title>
          <source>Pinecone</source>
          <access-date>2023-12-04</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.pinecone.io/learn/series/langchain/langchain-intro/">https://www.pinecone.io/learn/series/langchain/langchain-intro/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Papineni</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Roukos</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ward</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>WJ</given-names>
            </name>
          </person-group>
          <article-title>BLEU: a method for automatic evaluation of machine translation</article-title>
          <source>Proceedings of the 40th Annual Meeting on Association for Computational Linguistics</source>
          <year>2002</year>
          <conf-name>ACL '02</conf-name>
          <conf-date>July 7-12, 2002</conf-date>
          <conf-loc>Philadelphia, PA</conf-loc>
          <fpage>311</fpage>
          <lpage>8</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/10.3115/1073083.1073135"/>
          </comment>
          <pub-id pub-id-type="doi">10.3115/1073083.1073135</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Post</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>A call for clarity in reporting BLEU scores</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on April 23, 2018</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1804.08771"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/w18-6319</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Banerjee</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lavie</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>METEOR: an automatic metric for MT evaluation with high levels of correlation with human judgments</article-title>
          <source>Proceedings of the 2005 ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and/or Summarization</source>
          <year>2005</year>
          <conf-name>WIEEMMTS '05</conf-name>
          <conf-date>June 29, 2005</conf-date>
          <conf-loc>Ann Arbor, MI</conf-loc>
          <fpage>65</fpage>
          <lpage>72</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/W05-0909"/>
          </comment>
          <pub-id pub-id-type="doi">10.3115/1626355.1626389</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>CY</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>CY</given-names>
            </name>
          </person-group>
          <article-title>ROUGE: a package for automatic evaluation of summaries</article-title>
          <source>Text Summarization Branches Out Internet</source>
          <year>2004</year>
          <publisher-loc>Barcelona, Spain</publisher-loc>
          <publisher-name>Association for Computational Linguistics</publisher-name>
          <fpage>74</fpage>
          <lpage>81</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kishore</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Weinberger</surname>
              <given-names>KQ</given-names>
            </name>
            <name name-style="western">
              <surname>Artzi</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>BERTScore: evaluating text generation with BERT</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on April 21, 2019</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1904.09675"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>XE</given-names>
            </name>
            <name name-style="western">
              <surname>O'Brien</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Pasunuru</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Dwivedi-Yu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Golovneva</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Zettlemoyer</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Fazel-Zarandi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Celikyilmaz</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Shepherd: a critic for language model generation</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on August 8, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2308.04592"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dubois</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Taori</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Gulrajani</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Ba</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Guestrin</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Hashimoto</surname>
              <given-names>TB</given-names>
            </name>
          </person-group>
          <article-title>AlpacaFarm: a simulation framework for methods that learn from human feedback</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on May 22, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2305.14387"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bartko</surname>
              <given-names>JJ</given-names>
            </name>
          </person-group>
          <article-title>The intraclass correlation coefficient as a measure of reliability</article-title>
          <source>Psychol Rep</source>
          <year>1966</year>
          <month>08</month>
          <volume>19</volume>
          <issue>1</issue>
          <fpage>3</fpage>
          <lpage>11</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/10.2466/pr0.1966.19.1.3"/>
          </comment>
          <pub-id pub-id-type="doi">10.2466/pr0.1966.19.1.3</pub-id>
          <pub-id pub-id-type="medline">5942109</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gamer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lemon</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>IF</given-names>
            </name>
          </person-group>
          <article-title>irr: various coefficients of interrater reliability and agreement</article-title>
          <source>Cran R Project</source>
          <year>2019</year>
          <access-date>2023-12-12</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cran.r-project.org/web/packages/irr/index.html">https://cran.r-project.org/web/packages/irr/index.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="web">
          <article-title>Human subject regulations decision charts: 2018 requirements</article-title>
          <source>Office for Human Research Protection</source>
          <year>2019</year>
          <month>01</month>
          <day>20</day>
          <access-date>2024-04-03</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.research.fsu.edu/media/5857/human-subject-regulations-decision-charts-2018-requirements.pdf">https://tinyurl.com/3sbzydm3</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Leaman</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Retrieve, summarize, and verify: how will ChatGPT affect information seeking from the medical literature?</article-title>
          <source>J Am Soc Nephrol</source>
          <year>2023</year>
          <month>08</month>
          <day>01</day>
          <volume>34</volume>
          <issue>8</issue>
          <fpage>1302</fpage>
          <lpage>4</lpage>
          <pub-id pub-id-type="doi">10.1681/ASN.0000000000000166</pub-id>
          <pub-id pub-id-type="medline">37254254</pub-id>
          <pub-id pub-id-type="pii">00001751-202308000-00004</pub-id>
          <pub-id pub-id-type="pmcid">PMC10400098</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
