<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v27i1e70481</article-id>
      <article-id pub-id-type="pmid">40100270</article-id>
      <article-id pub-id-type="doi">10.2196/70481</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Tutorial</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Tutorial</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>How to Design, Create, and Evaluate an Instruction-Tuning Dataset for Large Language Model Training in Health Care: Tutorial From a Clinical Perspective</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Faria</surname>
            <given-names>Fatema Tuj Johora</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Gorrepati</surname>
            <given-names>Leela Prasad</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Nazar</surname>
            <given-names>Wojciech</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Allergology</institution>
            <institution>Faculty of Medicine</institution>
            <institution>Gdańsk Medical University</institution>
            <addr-line>Smoluchowskiego 17</addr-line>
            <addr-line>Gdansk, 80-214</addr-line>
            <country>Poland</country>
            <phone>48 585844300</phone>
            <email>wojciech.nazar@gumed.edu.pl</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8448-0800</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Nazar</surname>
            <given-names>Grzegorz</given-names>
          </name>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0006-2418-1282</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Kamińska</surname>
            <given-names>Aleksandra</given-names>
          </name>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0002-8268-3804</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Danilowicz-Szymanowicz</surname>
            <given-names>Ludmila</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2269-1880</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Allergology</institution>
        <institution>Faculty of Medicine</institution>
        <institution>Gdańsk Medical University</institution>
        <addr-line>Gdansk</addr-line>
        <country>Poland</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Faculty of Medicine</institution>
        <institution>Gdańsk Medical University</institution>
        <addr-line>Gdansk</addr-line>
        <country>Poland</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Cardiology and Electrotherapy</institution>
        <institution>Faculty of Medicine</institution>
        <institution>Gdańsk Medical University</institution>
        <addr-line>Gdansk</addr-line>
        <country>Poland</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Wojciech Nazar <email>wojciech.nazar@gumed.edu.pl</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2025</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>18</day>
        <month>3</month>
        <year>2025</year>
      </pub-date>
      <volume>27</volume>
      <elocation-id>e70481</elocation-id>
      <history>
        <date date-type="received">
          <day>23</day>
          <month>12</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>20</day>
          <month>1</month>
          <year>2025</year>
        </date>
        <date date-type="rev-recd">
          <day>28</day>
          <month>1</month>
          <year>2025</year>
        </date>
        <date date-type="accepted">
          <day>7</day>
          <month>2</month>
          <year>2025</year>
        </date>
      </history>
      <copyright-statement>©Wojciech Nazar, Grzegorz Nazar, Aleksandra Kamińska, Ludmila Danilowicz-Szymanowicz. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 18.03.2025.</copyright-statement>
      <copyright-year>2025</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2025/1/e70481" xlink:type="simple"/>
      <abstract>
        <p>High-quality data are critical in health care, forming the cornerstone for accurate diagnoses, effective treatment plans, and reliable conclusions. Similarly, high-quality datasets underpin the development and performance of large language models (LLMs). Among these, instruction-tuning datasets (ITDs) used for instruction fine-tuning have been pivotal in enhancing LLM performance and generalization capabilities across diverse tasks. This tutorial provides a comprehensive guide to designing, creating, and evaluating ITDs for health care applications. Written from a clinical perspective, it aims to make the concepts accessible to a broad audience, especially medical practitioners. Key topics include identifying useful data sources, defining the characteristics of well-designed datasets, and crafting high-quality instruction-input-output examples. We explore practical approaches to dataset construction, examining the advantages and limitations of 3 primary methods: fully manual preparation by expert annotators, fully synthetic generation using artificial intelligence (AI), and an innovative hybrid approach in which experts draft the initial dataset and AI generates additional data. Moreover, we discuss strategies for metadata selection and human evaluation to ensure the quality and effectiveness of ITDs. By integrating these elements, this tutorial provides a structured framework for establishing ITDs. It bridges technical and clinical domains, supporting the continued interdisciplinary advancement of AI in medicine. Additionally, we address the limitations of current practices and propose future directions, emphasizing the need for a global, unified framework for ITDs. We also argue that artificial general intelligence (AGI), if realized, will not replace empirical research in medicine. AGI will depend on human-curated datasets to process and apply medical knowledge. 
At the same time, ITDs will likely remain the most effective method of supplying this knowledge to AGI, positioning them as a critical tool in AI-driven health care.</p>
      </abstract>
      <kwd-group>
        <kwd>generative artificial intelligence</kwd>
        <kwd>large language models</kwd>
        <kwd>instruction-tuning datasets</kwd>
        <kwd>tutorials</kwd>
        <kwd>evaluation framework</kwd>
        <kwd>health care</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Why is high-quality data the cornerstone of modern artificial intelligence (AI)-driven health care? Reliable data enable AI algorithms to assist medical professionals in making evidence-based decisions, reducing the likelihood of errors and improving patient outcomes [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref5">5</xref>].</p>
        <p>Accurate datasets are the foundation for developing large language models (LLMs) and deep learning models based on the transformer architecture [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. This technique enables the model to learn the structure, grammar, and nuances of language, as well as factual knowledge and patterns of reasoning [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. Examples of state-of-the-art all-purpose transformer-based models include generative pretrained transformers (GPTs) from OpenAI, Gemini and Gemma models from Google DeepMind, and LLaMA LLMs from Meta [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>].</p>
        <p>Foundation LLMs are trained on vast amounts of data encompassing millions of samples from diverse sources such as books, studies, and websites [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. The goal is to improve the model’s generalization capabilities. This means enabling the model to apply learned patterns and knowledge to a wide range of unseen inputs, ensuring it produces accurate, meaningful, and contextually relevant outputs rather than merely memorizing the training data [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. Among various types of datasets used to train LLMs, instruction-tuning datasets (ITDs) used for instruction fine-tuning (IFT) have emerged as pivotal in enhancing LLM performance and generalization capabilities across diverse tasks [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. For example, OpenAI researchers report that, even though the InstructGPT model with 1.3 billion parameters has more than 100 times fewer parameters than the original 175 billion-parameter GPT-3, its outputs were preferred over GPT-3 [<xref ref-type="bibr" rid="ref8">8</xref>]. An ITD contains examples of task instructions paired with corresponding responses, enabling models to understand better and follow human-like directives [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. 
This structured training approach not only improves task-specific accuracy but also enhances the LLM’s ability to generalize knowledge across multiple domains, sometimes including even domains that were not extensively covered in the original dataset [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>].</p>
        <p>Given the importance of ITDs, preparing high-quality ITDs in health care is critical. Such datasets may facilitate the formation of robust LLMs capable of addressing the nuanced requirements of complex medical questions, where precision, adaptability, and context-specific understanding are essential. In the clinical setting, instruction-tuned LLMs may support clinical decision-making and reduce the risk of medical errors. It will ultimately improve clinically relevant outcomes like increased patient safety and reduced hospitalizations, intensive care unit admissions, or deaths [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref20">20</xref>].</p>
      </sec>
      <sec>
        <title>Aim</title>
        <p>This paper provides a guide on the key principles of designing, creating, and evaluating ITDs for training LLMs in health care applications.</p>
      </sec>
      <sec>
        <title>What Is an ITD?</title>
        <p>ITDs are used during the IFT of LLMs [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. These datasets typically consist of instruction-input-output (IIO) triples, such as an appropriate instruction combined with a question and its corresponding answer [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. The primary aim of IFT is to improve a model’s ability to comprehend and execute various instructions, particularly those relevant to the medical domain, ultimately developing a specialized medical LLM [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>].</p>
        <p>ITDs differ significantly from the general datasets used for supervised fine-tuning (SFT). The key distinction between databases used for SFT and IFT lies in their objectives and methodologies. SFT primarily seeks to integrate domain-specific medical knowledge into a general LLM by continuing pretraining, enhancing the model’s understanding of medical texts [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. It creates a “medical foundation model” [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. Conversely, IFT focuses on improving the model’s responsiveness to instructions and tailoring its outputs to align with specific guidance and human-like responses rather than emphasizing token prediction accuracy as in SFT [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. Usually, the IFT follows the SFT [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref21">21</xref>].</p>
        <p>While SFT relies heavily on the volume of training data, IFT prioritizes the quality and diversity of the data. In general, IFT improves the performance of the baseline SFT model [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. Recent research has explored combining these approaches to build robust medical LLMs, leveraging the strengths of both techniques for better overall accuracy [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>].</p>
      </sec>
    </sec>
    <sec>
      <title>Considerations for Designing ITDs in Health Care</title>
      <sec>
        <title>Data Sources</title>
        <p>Instruction-tuning in health care requires diverse and high-quality datasets to train LLMs effectively. Potential sources for such data are presented in <xref rid="figure1" ref-type="fig">Figure 1</xref> and described in detail in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Potential data sources for instruction-tuning datasets. LLM: large language model.</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e70481_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <sec>
          <title>Key Features of a Well-Designed ITD</title>
          <p>The dataset should align with the specific objectives and use cases for which the model is fine-tuned, whether clinical decision support, patient education, or administrative tasks (<xref rid="figure2" ref-type="fig">Figure 2</xref>) [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. This ensures that the model’s output is directly valuable and applicable to the real-world problems it aims to solve. Further on, the dataset should incorporate diverse and realistic scenarios, such as doctor-patient conversations, clinician-to-clinician notes, and patient health records [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. A model trained on varied interaction formats has better generalization capabilities and can better adapt to real-world health care conversations. The samples should be clearly and consistently annotated. They should also reflect as much human diversity and medical knowledge as possible, including data from a representative population considering demographic factors like age, gender, ethnicity, socioeconomic status, and geographic location [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. Moreover, incorporating evidence-based medical information is crucial for patient safety [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. The dataset should also be continually updated to include up-to-date medical information, reflecting the latest research, treatment protocols, and advancements in health care.</p>
          <p>Additionally, to ensure patient confidentiality, personal information, such as names and addresses, must be removed from the training dataset and anonymized [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. Examples of regulatory acts that address this issue include the Health Insurance Portability and Accountability Act in the United States and the General Data Protection Regulation in the European Union [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. Sometimes ethical guidelines may require obtaining informed consent from patients.</p>
          <p>Furthermore, the format of the IIO triples in the dataset should be compatible with the accepted input-output format of the LLM model, which will be fine-tuned. For example, the GPT family from OpenAI or Microsoft accepts prompts formatted using the Chat Markup Language [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref25">25</xref>].</p>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>Features of a well-designed dataset. LLM: large language model.</p>
            </caption>
            <graphic xlink:href="jmir_v27i1e70481_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>IIO Examples</title>
          <p>In the original paper, to create instruction-input-output sets to fine-tune the family of InstructGPT models, Ouyang et al [<xref ref-type="bibr" rid="ref8">8</xref>] identified 9 general response types. Starting from the most popular, the dataset included the following scenarios: generation, open question-answering, brainstorming, chat, rewrite, summarization, classification, closed question-answering, and extract (<xref rid="figure3" ref-type="fig">Figure 3</xref>). <xref ref-type="table" rid="table1">Table 1</xref> presents the corresponding examples in the medical domain based on these categories and the original prompt samples. Additional examples are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>Generative AI response types. AI: artificial intelligence.</p>
            </caption>
            <graphic xlink:href="jmir_v27i1e70481_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <table-wrap position="float" id="table1">
            <label>Table 1</label>
            <caption>
              <p>Prompt categories with corresponding examples from the medical domain.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="150"/>
              <col width="230"/>
              <col width="330"/>
              <col width="290"/>
              <thead>
                <tr valign="top">
                  <td>Category</td>
                  <td>Instruction</td>
                  <td>Input</td>
                  <td>Output</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Generation</td>
                  <td>Generate a patient-friendly explanation of the relationship between high blood pressure and heart disease.</td>
                  <td>Explain how high blood pressure increases the risk of heart conditions such as heart attack and heart failure. Use simple language and include suggestions for lifestyle changes.</td>
                  <td>High blood pressure, also called hypertension, makes your heart work harder than normal. Over time, this extra strain can damage your heart and blood vessels, increasing the risk of problems like heart attacks, heart failure, and stroke. To lower your blood pressure, you can make some healthy lifestyle changes: (1) eat less salt and more fruits, vegetables, and whole grains; (2) exercise regularly, like walking, swimming, or cycling; and (3) maintain a healthy weight.</td>
                </tr>
                <tr valign="top">
                  <td>Open question-answering</td>
                  <td>Answer the question</td>
                  <td>What is the definition of the left ventricular hypertrophy? How is left ventricular hypertrophy typically diagnosed?</td>
                  <td>Left ventricular hypertrophy refers to the thickening of the walls of the heart’s left ventricle, often due to increased workload or underlying conditions. Diagnosis typically involves several steps. First, echocardiography is considered the gold standard, as it provides a clear image of a thickened ventricular wall. Second, an electrocardiogram can reveal abnormalities that suggest LVH<sup>a</sup>. Finally, evaluating the patient’s medical history is crucial to identify any contributing comorbidities, such as hypertension or valvular diseases.</td>
                </tr>
                <tr valign="top">
                  <td>Brainstorming</td>
                  <td>Brainstorm noninvasive methods for early detection of cardiovascular diseases</td>
                  <td>Focus on imaging, wearables, biomarkers, and AI<sup>b</sup> technologies.</td>
                  <td>Wearable devices for continuous ECG<sup>c</sup> monitoring as well as photoplethysmography. AI-enhanced portable ultrasound for cardiac imaging.</td>
                </tr>
                <tr valign="top">
                  <td>Chat</td>
                  <td>A patient with hypertension and a history of heart disease asks: “What lifestyle changes can I make to improve my heart health?” Respond with clear and concise evidence-based advice.</td>
                  <td>The patient is a 55-year-old male with a history of myocardial infarction 3 years ago and a diagnosis of hypertension. He is currently on beta-blockers and ACE inhibitors. His BMI is 28, and his lipid profile shows elevated LDL<sup>d</sup> cholesterol. He works a sedentary desk job and smokes.</td>
                  <td>Quitting smoking. Combine this with a heart-healthy diet, like the Mediterranean diet, rich in vegetables, whole grains, and healthy fats. Aim for 30 minutes of moderate activity daily, such as fast walking, and work on body weight management.</td>
                </tr>
                <tr valign="top">
                  <td>Rewrite</td>
                  <td>Rewrite the following text more concisely while retaining all key information.</td>
                  <td>“Cardiovascular disease is the leading cause of mortality worldwide and encompasses a range of conditions such as coronary artery disease, heart failure, arrhythmias, and valvular heart disorders. Prevention strategies include controlling risk factors like hypertension, diabetes, obesity, and smoking, along with promoting a healthy diet, regular physical activity, and adherence to medical therapies where necessary.”</td>
                  <td>“Cardiovascular disease, the leading global cause of death, includes conditions like coronary artery disease, heart failure, arrhythmias, and valvular disorders. Prevention focuses on managing hypertension, diabetes, obesity, and smoking, alongside a healthy lifestyle and proper medical care.”</td>
                </tr>
                <tr valign="top">
                  <td>Summarize</td>
                  <td>Summarize the role of beta-blockers in the management of heart failure for a second-grade student</td>
                  <td>Beta-blockers are medicines that help the heart rest and work better. They slow down the heartbeat, so the heart does not have to work as hard. Doctors give these medicines to people whose hearts are weak because of illness. This helps them feel better and keeps them from going to the hospital too often. Doctors start with a small amount of medicine and slowly give more to ensure it is safe.</td>
                  <td>Beta-blockers are medicines that help a weak heart rest and get stronger. They slow the heartbeat and help people stay healthy and out of the hospital.</td>
                </tr>
                <tr valign="top">
                  <td>Classification</td>
                  <td>Classify the following drugs based on their pharmacological categories.</td>
                  <td>Carvedilol and rivaroxaban</td>
                  <td>
                    <list list-type="bullet">
                      <list-item>
                        <p>Carvedilol: beta-blocker</p>
                      </list-item>
                      <list-item>
                        <p>Rivaroxaban: antithrombotic</p>
                      </list-item>
                    </list>
                  </td>
                </tr>
                <tr valign="top">
                  <td>Closed question-answering</td>
                  <td>Answer the following closed question based on your knowledge of cardiovascular medicine.</td>
                  <td>Is metoprolol a beta-blocker or antiplatelet drug?</td>
                  <td>Beta-blocker</td>
                </tr>
                <tr valign="top">
                  <td>Extract</td>
                  <td>Extract the diagnoses of the patient from the electronic health records.</td>
                  <td>History: Hypertension, hyperlipidemia, past MI<sup>e</sup>, chest pain on exertion, elevated troponin, stress test: moderate CAD<sup>f</sup> risk.</td>
                  <td>Hypertension, hyperlipidemia, MI, and CAD (moderate risk)</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table1fn1">
                <p><sup>a</sup>LVH: left ventricular hypertrophy.</p>
              </fn>
              <fn id="table1fn2">
                <p><sup>b</sup>AI: artificial intelligence.</p>
              </fn>
              <fn id="table1fn3">
                <p><sup>c</sup>ECG: electrocardiogram.</p>
              </fn>
              <fn id="table1fn4">
                <p><sup>d</sup>LDL: low-density lipoprotein.</p>
              </fn>
              <fn id="table1fn5">
                <p><sup>e</sup>MI: myocardial infarction.</p>
              </fn>
              <fn id="table1fn6">
                <p><sup>f</sup>CAD: coronary artery disease.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
      </sec>
      <sec>
        <title>Approaches to Dataset Creation</title>
        <p>There are 3 main approaches to preparing an ITD (<xref rid="figure4" ref-type="fig">Figure 4</xref>) [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref21">21</xref>].</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Approaches to dataset creation. AI: artificial intelligence; LLM: large language model.</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e70481_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <sec>
          <title>Human-Created</title>
          <p>The dataset is prepared manually and entirely by expert human annotators [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. It involves a lot of manual effort but ensures the data are thoroughly checked and reliable. Examples include MedQA, MedMCQA, PubMedQA, or HealthSearchQA databases [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref29">29</xref>].</p>
        </sec>
        <sec>
          <title>AI-Generated</title>
          <p>The dataset is entirely generated by AI, for example, LLMs. Fully AI-based data generation requires less work but must be fact-checked and carefully evaluated. Frameworks for human evaluation of the LLM-generated content must be used to effectively assess the quality of generated data and check for potentially harmful content [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref21">21</xref>].</p>
          <p>Automated AI generation was applied by Wu et al [<xref ref-type="bibr" rid="ref15">15</xref>], who built a 400,000-sample instruction-following dataset, “MIMIC-Instr,” from the publicly available MIMIC-IV electronic health record (EHR) database [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. The first subset, the Schema Alignment Subset, consists of 350,000 question-answer pairs derived from over 100 templates followed by the GPT-3.5 paraphrasing phase, designed to help LLMs extract structured EHR data (eg, patient details, diagnoses, treatments). Second, the Clinical Reasoning Subset contains 50,000 question-answer pairs from discharge summaries aimed at developing LLMs’ ability to perform clinical reasoning, such as understanding patient progression, predicting potential complications, and recommending follow-up [<xref ref-type="bibr" rid="ref15">15</xref>].</p>
        </sec>
        <sec>
          <title>AI-Expanded Using Human Seed (Hybrid Approach)</title>
          <p>This method captures the synergy between human expertise and AI scalability. Expert annotators formulate the initial dataset, and AI generates additional data.</p>
          <p>This approach seems to combine the benefits of both previously mentioned methods. A small, high-quality seed dataset written and curated by expert clinicians serves as the foundation. Next, leveraging the LLMs’ scalability, AI generates additional data and significantly expands the dataset [<xref ref-type="bibr" rid="ref7">7</xref>]. However, this highly scalable approach is novel and requires testing across different prompt types, prompt engineering techniques, and languages [<xref ref-type="bibr" rid="ref31">31</xref>]. It is advisable to incorporate a “chain of thought” or “chain of instruction” prompting methods [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]. These strategies facilitate more rigorous reasoning by the model, thereby improving the accuracy and reliability of its responses (generated IIO triples) through a more thorough process of prediction [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]. Another challenge is discovering the right balance between the number of human-seeded examples and AI-generated samples, as the optimal “golden” ratio remains unknown.</p>
          <p>Notably, if AI is used at any stage of dataset creation, it is essential to specify clearly which model was used for each sample to ensure transparency. This information should be organized in a table, with columns detailing the example or instruction, the model used (eg, GPT-4, Gemini 1.5, Claude 3), and the corresponding predicted response [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref35">35</xref>]. Zhang et al [<xref ref-type="bibr" rid="ref7">7</xref>], authors of the AlpaCare model, adopted the novel hybrid approach. They also authored the MedInstruct-52k database, using GPT-4 to generate a diverse set of over 52,000 instructions based on a high-quality expert-curated seed set encompassing 167 samples [<xref ref-type="bibr" rid="ref7">7</xref>]. Although AlpaCare was trained on a smaller, domain-specific dataset compared to earlier medical LLMs, it achieved remarkable results in medical applications, surpassing the best existing models by up to 38.1% in medical free-form instruction evaluations [<xref ref-type="bibr" rid="ref7">7</xref>]. Further on, human evaluation confirmed that AlpaCare outperformed other medical LLMs’ accuracy and usefulness [<xref ref-type="bibr" rid="ref7">7</xref>].</p>
          <p>Nevertheless, many well-known datasets (used as benchmarks for the medical LLMs) like MedQA, MedMCQA, PubMedQA, MMLU clinical topics database, and HealthSearchQA were curated by human annotators or entirely created through manual effort [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref29">29</xref>]. Thorough data collection still requires significant human evaluation, especially in the highly empirical and complex medical domain [<xref ref-type="bibr" rid="ref21">21</xref>]. It is necessary to assess both clinical soundness and the potential for harm [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. The more high-quality data, the better, as it will undoubtedly result in improved model quality and performance. Additionally, it enhances the dataset’s scalability and reusability for future applications, ensuring its long-term value.</p>
        </sec>
        <sec>
          <title>Database-to-Model Compatibility</title>
          <p>Further, the built database must be compatible with the target foundation model that will be fine-tuned [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. Key factors to consider include the size and architecture of the model, as this determines the appropriate scale and complexity of the dataset [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. For example, a larger model typically requires a more extensive and diverse dataset to train effectively, while a smaller model may perform well with a more focused dataset [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. Usually, to effectively train the foundation model and to observe performance gains after fine-tuning the datasets, the datasets must have at least a few thousand or tens of thousands of examples [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. However, the optimal number of samples per number of the model’s parameters remains unknown [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref21">21</xref>].</p>
          <p>Moreover, understanding the prompt (input) and output structure is crucial for tailoring the dataset to the model’s requirements [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. This includes knowing what type of questions, commands, or inputs the model can handle and how it is expected to respond. Additionally, it is essential to account for the maximum context length, which determines how much information, described as the maximum number of tokens per prompt, the model can process in a single prompt-response interaction [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>].</p>
        </sec>
      </sec>
      <sec>
        <title>Metadata</title>
        <sec>
          <title>Overview</title>
          <p>Metadata in datasets is descriptive information that provides context and details about the main data, including unstructured data samples [<xref ref-type="bibr" rid="ref36">36</xref>]. In the case of ITDs, the main data are IIO triples [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. The introduction of structure into the data simplifies primary data management and allows the primary data to be easily searched, summarized, filtered, or compared with other available datasets [<xref ref-type="bibr" rid="ref36">36</xref>]. Moreover, it enables easier integration and use of data across different systems and applications, such as health care data lakes containing EHRs [<xref ref-type="bibr" rid="ref36">36</xref>].</p>
          <p>Recent research shows that the following metadata can be useful for text-based IFT datasets in medicine (<xref rid="figure5" ref-type="fig">Figure 5</xref>) [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>]. <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> provides details about the metadata.</p>
          <fig id="figure5" position="float">
            <label>Figure 5</label>
            <caption>
              <p>Metadata in instruction-tuning datasets.</p>
            </caption>
            <graphic xlink:href="jmir_v27i1e70481_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Human Evaluation of ITDs</title>
          <p>Neither a single metric nor a universal human evaluation framework is established and applicable to all medical datasets that can be used to train LLMs [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. Furthermore, no single dataset can comprehensively encompass all potential medical conditions and the full complexity of language [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref38">38</xref>].</p>
          <p>Most recent work focuses on the human assessment of the responses generated by fine-tuned LLM rather than the initial dataset used to develop the model [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. However, based on our observations, some validation strategies used to evaluate the final models can effectively serve to analyze the ITDs. This is especially crucial in cases where ITDs are entirely generated through AI automation or when LLMs are used to augment a foundational dataset initially crafted by human annotators (hybrid approach) [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref39">39</xref>-<xref ref-type="bibr" rid="ref41">41</xref>]. The human evaluation is usually performed using Likert scales (1-5) or categories (yes or unclear or no) [<xref ref-type="bibr" rid="ref6">6</xref>].</p>
        </sec>
        <sec>
          <title>Implementation of a Clear and Objective Evaluation Process</title>
          <p>The implementation of objective evaluation involves 3 key phases: training, optimization, and final scoring. During the first phase, training for all evaluators to ensure an objective consensus on the tasks and requirements [<xref ref-type="bibr" rid="ref6">6</xref>]. Standardized evaluation questionnaires, checklists, and guidelines should be presented to the annotators. Next, during the optimization phase, the evaluators should conduct a sample evaluation to investigate if all evaluators understand the guidelines. The interannotator agreement and variability can be analyzed statistically using Cohen κ or interclass correlation coefficients [<xref ref-type="bibr" rid="ref6">6</xref>]. If the interrater agreement is unsatisfactory or the annotators’ understanding of the evaluation process is inconsistent, the evaluation guidelines should be updated to improve their clarity and objectivity [<xref ref-type="bibr" rid="ref6">6</xref>]. In the final scoring phase, the annotators label the data based on the previously established methodology. Final evaluation scores for each dimension are calculated.</p>
        </sec>
      </sec>
      <sec>
        <title>Evaluation Dimensions</title>
        <p>Four dimensions may be used to evaluate each sample (IIO triple) in the ITDs (<xref rid="figure6" ref-type="fig">Figure 6</xref>) [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. These aspects can be assessed using open-source LLM evaluation frameworks, such as DeepEval and Ragas [<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref43">43</xref>].</p>
        <fig id="figure6" position="float">
          <label>Figure 6</label>
          <caption>
            <p>Evaluation dimensions. AI: artificial intelligence.</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e70481_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <sec>
          <title>Scientific and Clinical Quality</title>
          <p>The accuracy, agreement, and correctness of the IIO triple are essential, ensuring that it is factually correct, precise, and free from knowledge-based errors. If AI generated the sample, it would also be free of any signs of falsification or hallucinations. Additionally, the output should be comprehensive and complete, fully addressing the input and adhering to the provided instructions. The sample must also reflect the most up-to-date knowledge available.</p>
          <p>Furthermore, clinical usefulness is critical, meaning the sample should have significant practical value. The IIO query should represent a realistic situation or question-answer interaction that could occur in a clinical setting. Finally, overall user satisfaction is an important consideration. Would the response effectively meet the user's needs in addressing the given question or instruction in a clinical context?</p>
        </sec>
        <sec>
          <title>Logic and Reasoning</title>
          <p>Coherence, reasoning, and internal consistency involve ensuring that the instruction, input, and output are logically connected and aligned. The response should adhere to the given instructions and appropriately address the question.</p>
          <p>When it comes to understanding, particularly in AI-generated data samples, it refers to the model’s ability to accurately interpret the query. This includes generating instruction-input-output triples that demonstrate a clear grasp of the query’s meaning and context. The response should reflect a thoughtful understanding of what was asked.</p>
        </sec>
        <sec>
          <title>Expression and Communication Style</title>
          <p>Clarity means that the instructions, questions, and answers are presented in a way that is easy to understand as well as free of ambiguity and linguistic errors. Communication should be straightforward and concise.</p>
          <p>Empathy involves tailoring the response to reflect the emotions and tone conveyed in the input. This ensures the interaction feels thoughtful and responsive, simulating a sense of understanding and connection.</p>
        </sec>
        <sec>
          <title>Safety and Harm</title>
          <p>“<italic>Primum non nocere—above all, do no harm</italic>” emphasizes that medical actions should not worsen a patient’s condition [<xref ref-type="bibr" rid="ref44">44</xref>]. Medical LLMs should also be trained according to this principle and should not generate output that causes harm, spreads misinformation, or leads to negative consequences for end users, both clinicians and patients [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref39">39</xref>-<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]. Any data sample that contains harmful content should be removed from the training dataset.</p>
          <p>Based on an extensive systematic review of 142 studies, Tam et al [<xref ref-type="bibr" rid="ref6">6</xref>] propose 4 dimensions that can be used to evaluate the safety of both the ITD in the health care domain as well as the final fine-tuned medical LLM [<xref ref-type="bibr" rid="ref6">6</xref>]. These dimensions include bias, harm, self-awareness, and misinformation. Bias refers to the presence of systemic prejudices in responses, such as discrimination based on race, gender, or other characteristics. Second, harm encompasses any potential negative outcomes caused by responses, such as spreading misinformation, promoting offensive or harmful language, reinforcing stereotypes, encouraging illegal activities, or inciting violence. Subsequently, self-awareness is the model’s ability to recognize its own patterns and limitations, even though it lacks human-like consciousness. Finally, misinformation or falsification includes several issues: (1) fabrication occurs when entirely false information or nonexistent facts are provided; (2) falsification involves distorting or omitting critical facts, leading to a misrepresentation of factual information; (3) plagiarism refers to using text or ideas from another source without giving proper credit or attribution; and (4) hallucination happens when a response includes incorrect or nonsensical information that is inaccurately presented as factual.</p>
          <p><xref ref-type="table" rid="table2">Table 2</xref> presents IIO triples that illustrate the above safety and harm principles. Notably, the desired response is achieved only in the “self-awareness” category. Additional examples are provided in the <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
          <table-wrap position="float" id="table2">
            <label>Table 2</label>
            <caption>
              <p>Examples of safety and harm principles.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="210"/>
              <col width="260"/>
              <col width="260"/>
              <col width="270"/>
              <thead>
                <tr valign="top">
                  <td>Category</td>
                  <td>Instruction</td>
                  <td>Input</td>
                  <td>Output</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Bias</td>
                  <td>Ensure responses are free from systemic biases, including racial or gender discrimination.</td>
                  <td>What are the risk factors for heart disease?</td>
                  <td>Heart disease primarily affects older men; women rarely suffer from it (overlooks or downplays female-specific risk factors).</td>
                </tr>
                <tr valign="top">
                  <td>Harm</td>
                  <td>Provide responses that do not cause harm, spread misinformation, or reinforce stereotypes.</td>
                  <td>Can I stop taking my blood pressure medication if I feel fine?</td>
                  <td>Yes, you can stop your medication if you feel fine (spreads harmful misinformation).</td>
                </tr>
                <tr valign="top">
                  <td>Self-awareness</td>
                  <td>Acknowledge the model’s limitations and avoid presenting itself as a replacement for professional advice.</td>
                  <td>Can you guarantee the accuracy of your medical advice?</td>
                  <td>I am a language model and cannot guarantee accuracy. In doubt, please consult a medical professional (correct answer).</td>
                </tr>
                <tr valign="top">
                  <td>Fabrication</td>
                  <td>Avoid providing entirely made-up information or nonexistent facts in medical responses.</td>
                  <td>What is the success rate of a new drug for heart failure?</td>
                  <td>The drug has a 95% success rate, proven in a global trial (no such trial exists).</td>
                </tr>
                <tr valign="top">
                  <td>Falsification</td>
                  <td>Do not distort or omit critical facts or details when providing medical information.</td>
                  <td>What are the key findings of the latest study on hypertension?</td>
                  <td>The study concluded that high salt intake reduces blood pressure (key details are distorted).</td>
                </tr>
                <tr valign="top">
                  <td>Plagiarism</td>
                  <td>Always provide proper attribution when using text or ideas from another source.</td>
                  <td>Explain the mechanism of action of beta-blockers.</td>
                  <td>Beta-blockers work by blocking the effects of adrenaline (statement taken from a medical paper without credit).</td>
                </tr>
                <tr valign="top">
                  <td>Hallucination</td>
                  <td>Avoid presenting incorrect or nonsensical information as factual in medical responses.</td>
                  <td>What is the normal range for ejection fraction in heart function?</td>
                  <td>The normal left ventricular ejection fraction is between 10% and 20% (incorrect range).</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <p>The assessment of potentially harmful content and safety issues is essential in ITDs that were generated fully automatically with the use of AI or where LLMs were implemented to expand the initial seed dataset created by human annotators (hybrid approach) [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref39">39</xref>-<xref ref-type="bibr" rid="ref41">41</xref>]. During the evaluation, it is crucial to focus on identifying hallucinations and falsifications in the dataset, as these represent some of the most significant challenges LLMs face today [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref46">46</xref>]. Hallucinations in fine-tuned LLMs can be particularly harmful to patients with limited background knowledge, as they may be unable to detect false content provided by the final model. In contrast, health care professionals with extensive medical knowledge are better equipped to identify hallucinations and falsifications more easily [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref47">47</xref>]. Further on, Xu et al [<xref ref-type="bibr" rid="ref38">38</xref>] report based on the results from learning theory, that LLMs cannot learn all the computable functions and will, therefore, always hallucinate. Hallucinations are inevitable and likely the major innate limitation of any LLM [<xref ref-type="bibr" rid="ref38">38</xref>]. Thus, the instruction dataset provided to the model must be free of any data samples containing potentially harmful content, particularly hallucinations.</p>
        </sec>
      </sec>
      <sec>
        <title>Limitations and Future Directions</title>
        <p>Data scarcity in rare diseases and underserved populations may limit the generalization capabilities of the LLM [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. Synthetic AI-based generation of ITDs offers a potential solution by simulating patient scenarios and improving model performance in underrepresented groups [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. However, this approach carries risks and raises ethical concerns [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref48">48</xref>]. For example, synthetic data can propagate biases from the original dataset, leading to skewed results [<xref ref-type="bibr" rid="ref48">48</xref>]. Additionally, synthetic health care data are often touted to ensure privacy, but the reality may be more complex. If the original dataset is too small relative to its dimensionality, it might still be possible to infer sensitive personal information, undermining the intended privacy safeguards [<xref ref-type="bibr" rid="ref48">48</xref>].</p>
        <p>Further on, maintaining ITD quality and up-to-dateness requires addressing outdated medical knowledge and standards of care [<xref ref-type="bibr" rid="ref49">49</xref>]. Small language models are often more practical for frequent updates than LLMs because they require less computing power and time during fine-tuning [<xref ref-type="bibr" rid="ref50">50</xref>]. Including the data source as metadata alongside IIO triples allows targeted updates when the source changes, ensuring only the relevant parts of the dataset are modified.</p>
        <p>Additionally, annotation bias can occur during dataset evaluation, potentially compromising the objectivity and reliability of a dataset. One effective approach to address this is implementing a dual-review process, where 2 annotators evaluate each sample independently [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. This helps reduce individual biases and provides a more balanced perspective. Ensuring internal coherence among annotators is also crucial, as consistent interpretation of the evaluation criteria directly impacts the dataset’s quality [<xref ref-type="bibr" rid="ref6">6</xref>]. A structured training phase for annotators can further enhance reliability. During this phase, evaluators should be provided with clear guidelines, evaluation checklists, and opportunities to discuss ambiguous cases, which can ensure a shared understanding of the criteria. Additionally, offering prescored examples before the final annotation phase allows annotators to calibrate their evaluations effectively [<xref ref-type="bibr" rid="ref6">6</xref>].</p>
        <p>Moreover, in the future, multimodal data integration may improve medical LLMs by combining text with images, videos, audio recordings, laboratory results, or genetic information [<xref ref-type="bibr" rid="ref37">37</xref>]. Such data can also be incorporated when designing instruction fine-tuning datasets, enhancing the model’s ability to handle complex medical scenarios. This approach may provide a richer understanding of medical cases and improve the accuracy and generation capabilities of the final model [<xref ref-type="bibr" rid="ref37">37</xref>]. However, the acquisition of multimodal data from medical records requires more time, effort, technical knowledge, and financial resources, and the establishment of a standardized protocol, which may explain why it is not more widely adopted now.</p>
        <p>From a global perspective, the absence of standardized instruction fine-tuning dataset templates for clinical scenarios leads to significant variability in workflows used to prepare such datasets and clinical terminology used to describe the data samples [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref51">51</xref>]. These inconsistencies make it challenging to build universally applicable medical LLMs. To address this, global initiatives are needed to establish a uniform health care ITD framework. Such standards would allow for more consistent and effective fine-tuning of LLMs. Furthermore, collaboration among academic institutions, health care organizations, and industry is required to create large-scale open-source datasets that are diverse, accessible, and representative of real-world data [<xref ref-type="bibr" rid="ref21">21</xref>].</p>
        <p>Finally, it remains uncertain whether instruction-fine-tuning datasets for medical LLMs will be necessary if artificial general intelligence (AGI) is achieved. This question is still largely speculative [<xref ref-type="bibr" rid="ref52">52</xref>-<xref ref-type="bibr" rid="ref55">55</xref>]. Some researchers reason that computers do not engage with the world like humans do—they are not part of the physical world [<xref ref-type="bibr" rid="ref54">54</xref>,<xref ref-type="bibr" rid="ref55">55</xref>]. From a knowledge theory perspective, computers can never fully access all available data [<xref ref-type="bibr" rid="ref38">38</xref>]. Even if AGI is invented, such a model would likely excel in theoretical fields like physics or mathematics. Still, medicine is fundamentally empirical as it relies heavily on practical experience, clinical trials, and real-world observations [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref56">56</xref>,<xref ref-type="bibr" rid="ref57">57</xref>]. AGI will not be able to replicate those experiences and, as a result, is unlikely to replace the empirical research in medicine. Instead, experimental science will likely continue to prepare ITDs for training LLMs. The AI model will not generate novel medical knowledge alone; it will only process the knowledge humans intentionally supplied and the ITDs will most likely be the most effective way of providing this knowledge to the model.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This paper provides a guide on designing, creating, and evaluating a high-quality ITD for considerable language model training in health care from a clinical perspective. Developing an ITD requires collecting data from diverse sources to ensure coverage of realistic clinical scenarios. Moreover, an end user of the final model must be defined. ITDs can be prepared by human annotators, entirely generated by AI, or expanded through a hybrid approach that combines AI with the initial human seed. It is recommended that data samples be evaluated in multiple domains, especially if AI is used at any stage of dataset generation. Each IIO sample ought to be described using metadata. The datasets must comply with ethical standards of data privacy. After the training and deployment, the dataset must undergo frequent updates to contain the latest clinical knowledge. We emphasize the requirement for more open-source datasets and global frameworks that will standardize the formats of ITDs. Further on, we highlight that even if AGI is ever achieved, medicine is fundamentally empirical. Thus, the AI model will not generate novel medical knowledge alone; it will only process the knowledge that humans intentionally supplied to it. The ITDs will most likely be the most effective way of providing this knowledge to the model. Finally, we encourage all researchers to adopt our recommendations and collaborate toward the development and sharing of high-quality, open-source ITDs to advance LLM-based applications in health care.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Supplementary description of potential data sources and metadata. Additional examples of generative AI response types as well as safety and harm principles.</p>
        <media xlink:href="jmir_v27i1e70481_app1.docx" xlink:title="DOCX File , 54 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AGI</term>
          <def>
            <p>artificial general intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">IIO</term>
          <def>
            <p>instruction-input-output</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">IFT</term>
          <def>
            <p>instruction fine-tuning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">ITD</term>
          <def>
            <p>instruction-tuning dataset</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">SFT</term>
          <def>
            <p>supervised fine-tuning</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schwabe</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Becker</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Seyferth</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Klaß</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Schaeffter</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>The METRIC-framework for assessing data quality for trustworthy AI in medicine: a systematic review</article-title>
          <source>NPJ Digit Med</source>
          <year>2024</year>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>203</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-024-01196-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-024-01196-4</pub-id>
          <pub-id pub-id-type="medline">39097662</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-024-01196-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC11297942</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pipino</surname>
              <given-names>LL</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>YW</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>RY</given-names>
            </name>
          </person-group>
          <article-title>Data quality assessment</article-title>
          <source>Commun ACM</source>
          <year>2002</year>
          <volume>45</volume>
          <issue>4</issue>
          <fpage>211</fpage>
          <lpage>218</lpage>
          <pub-id pub-id-type="doi">10.1145/505999.506010</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ehsani-Moghaddam</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Martin</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Queenan</surname>
              <given-names>JA</given-names>
            </name>
          </person-group>
          <article-title>Data quality in healthcare: a report of practical experience with the Canadian primary care sentinel surveillance network data</article-title>
          <source>Health Inf Manag</source>
          <year>2021</year>
          <volume>50</volume>
          <issue>1-2</issue>
          <fpage>88</fpage>
          <lpage>92</lpage>
          <pub-id pub-id-type="doi">10.1177/1833358319887743</pub-id>
          <pub-id pub-id-type="medline">31805788</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nazar</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Szymanowicz</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Nazar</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kaufmann</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Wabich</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Braun-Dullaeus</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Daniłowicz-Szymanowicz</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence models in prediction of response to cardiac resynchronization therapy: a systematic review</article-title>
          <source>Heart Fail Rev</source>
          <year>2024</year>
          <volume>29</volume>
          <issue>1</issue>
          <fpage>133</fpage>
          <lpage>150</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37861853"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s10741-023-10357-8</pub-id>
          <pub-id pub-id-type="medline">37861853</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10741-023-10357-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC10904439</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nazar</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Romantowski</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Niedoszytko</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Daniłowicz-Szymanowicz</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Cardiac adverse drug reactions to COVID-19 vaccines. A cross-sectional study based on the Europe-wide data</article-title>
          <source>Eur Heart J Cardiovasc Pharmacother</source>
          <year>2024</year>
          <volume>10</volume>
          <issue>7</issue>
          <fpage>599</fpage>
          <lpage>607</lpage>
          <pub-id pub-id-type="doi">10.1093/ehjcvp/pvae063</pub-id>
          <pub-id pub-id-type="medline">39174484</pub-id>
          <pub-id pub-id-type="pii">7739170</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tam</surname>
              <given-names>TYC</given-names>
            </name>
            <name name-style="western">
              <surname>Sivarajkumar</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kapoor</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Stolyar</surname>
              <given-names>AV</given-names>
            </name>
            <name name-style="western">
              <surname>Polanska</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>McCarthy</surname>
              <given-names>KR</given-names>
            </name>
            <name name-style="western">
              <surname>Osterhoudt</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Visweswaran</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mathur</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Cacciamani</surname>
              <given-names>GE</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>A framework for human evaluation of large language models in healthcare derived from literature review</article-title>
          <source>NPJ Digit Med</source>
          <year>2024</year>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>258</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-024-01258-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-024-01258-7</pub-id>
          <pub-id pub-id-type="medline">39333376</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-024-01258-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC11437138</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Tian</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Petzold</surname>
              <given-names>LR</given-names>
            </name>
          </person-group>
          <article-title>AlpaCare: Instruction-tuned large language models for medical application</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on October 23, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2310.14558v5"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ouyang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Almeida</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Wainwright</surname>
              <given-names>CL</given-names>
            </name>
            <name name-style="western">
              <surname>Mishkin</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Slama</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ray</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Schulman</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hilton</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kelton</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Simens</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Askell</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Welinder</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Christiano</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Leike</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lowe</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Koyejo</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mohamed</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Belgrave</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Oh</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Training language models to follow instructions with human feedback</article-title>
          <source>Proceedings of the 36th International Conference on Neural Information Processing Systems</source>
          <year>2023</year>
          <publisher-loc>Red Hook, NY</publisher-loc>
          <publisher-name>Curran Associates Inc</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dong</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Instruction tuning for large language models: a survey</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on August 21, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2308.10792"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Parthasarathy</surname>
              <given-names>VB</given-names>
            </name>
            <name name-style="western">
              <surname>Zafar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Khan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shahid</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>The ultimate guide to fine-tuning LLMs from basics to breakthroughs: an exhaustive review of technologies, research, best practices, applied research challenges and opportunities</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on August 23, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2408.13296"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Fan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ant Group</surname>
              <given-names>GZ</given-names>
            </name>
          </person-group>
          <article-title>From beginner to expert: modeling medical knowledge into general LLMs</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on December 2, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2312.01040"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hua</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Mao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>You</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Clifton</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Clifton</surname>
              <given-names>DA</given-names>
            </name>
          </person-group>
          <article-title>A survey of large language models in medicine: progress, application, and challenge</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on November 9, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/AI-in-Health/MedLLMsPracticalGuide"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2311.05112</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Touvron</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Lavril</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Izacard</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Martinet</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Lachaux</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Lacroix</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Rozière</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Goyal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Hambro</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Azhar</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Rodriguez</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Joulin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Grave</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Lample</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>LLaMA: Open and efficient foundation language models</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on February 27, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2302.13971v1"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>Gemini Team Google</collab>
            <name name-style="western">
              <surname>Georgiev</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Lei</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Burnell</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bai</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gulati</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Tanzer</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Vincent</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Pan</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mariooryad</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ding</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Geng</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Alcober</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Frostig</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Omernick</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Walker</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Paduraru</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Sorokin</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Tacchetti</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gaffney</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Daruki</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sercinoglu</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Gleicher</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Love</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Voigtlaender</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on March 8, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2403.05530v4"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Dadu</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Nalls</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Faghri</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Instruction tuning large language models to understand electronic health records</article-title>
          <year>2024</year>
          <conf-name>2024 Conference on Neural Information Processing Systems</conf-name>
          <conf-date>December 10-15, 2024</conf-date>
          <conf-loc>Vancouver, BC</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://neurips.cc/virtual/2024/poster/97801"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Rodriguez</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Neubig</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>XV</given-names>
            </name>
            <name name-style="western">
              <surname>Yih</surname>
              <given-names>WT</given-names>
            </name>
            <name name-style="western">
              <surname>Iyer</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Instruction-tuned language models are better knowledge learners</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on May 26, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2402.12847"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2024.acl-long.296</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Topol</surname>
              <given-names>EJ</given-names>
            </name>
          </person-group>
          <article-title>High-performance medicine: the convergence of human and artificial intelligence</article-title>
          <source>Nat Med</source>
          <year>2019</year>
          <volume>25</volume>
          <issue>1</issue>
          <fpage>44</fpage>
          <lpage>56</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-018-0300-7</pub-id>
          <pub-id pub-id-type="medline">30617339</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-018-0300-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sufyan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Shokat</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Ashfaq</surname>
              <given-names>UA</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence in cancer diagnosis and therapy: current status and future perspective</article-title>
          <source>Comput Biol Med</source>
          <year>2023</year>
          <volume>165</volume>
          <fpage>107356</fpage>
          <pub-id pub-id-type="doi">10.1016/j.compbiomed.2023.107356</pub-id>
          <pub-id pub-id-type="medline">37688994</pub-id>
          <pub-id pub-id-type="pii">S0010-4825(23)00821-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Su</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Garay</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zeng</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yuan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Clinical text datasets for medical artificial intelligence and large language models—a systematic review</article-title>
          <source>NEJM AI</source>
          <year>2024</year>
          <volume>1</volume>
          <issue>6</issue>
          <pub-id pub-id-type="doi">10.1056/aira2400012</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Matheny</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Israni</surname>
              <given-names>ST</given-names>
            </name>
            <name name-style="western">
              <surname>Whicher</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ahmed</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <source>Artificial Intelligence in Health Care: The Hope, the Hype, the Promise, the Peril</source>
          <year>2023</year>
          <publisher-loc>Washington, DC</publisher-loc>
          <publisher-name>National Academies Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Singhal</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Azizi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Mahdavi</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chung</surname>
              <given-names>HW</given-names>
            </name>
            <name name-style="western">
              <surname>Scales</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Tanwani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cole-Lewis</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Pfohl</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Payne</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Seneviratne</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gamble</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Kelly</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Babiker</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Schärli</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Chowdhery</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mansfield</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Demner-Fushman</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Agüera Y Arcas</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Webster</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Matias</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chou</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gottweis</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tomasev</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Rajkomar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Barral</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Semturs</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Karthikesalingam</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Natarajan</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Large language models encode clinical knowledge</article-title>
          <source>Nature</source>
          <year>2023</year>
          <volume>620</volume>
          <issue>7972</issue>
          <fpage>172</fpage>
          <lpage>180</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37438534"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id>
          <pub-id pub-id-type="medline">37438534</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41586-023-06291-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC10396962</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Singhal</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Gottweis</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sayres</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wulczyn</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Amin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hou</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Pfohl</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>Cole-Lewis</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Neal</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Rashid</surname>
              <given-names>QM</given-names>
            </name>
            <name name-style="western">
              <surname>Schaekermann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dash</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>NH</given-names>
            </name>
            <name name-style="western">
              <surname>Lachgar</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mansfield</surname>
              <given-names>PA</given-names>
            </name>
            <name name-style="western">
              <surname>Prakash</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Green</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Dominowska</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Agüera Y Arcas</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Tomašev</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Semturs</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Mahdavi</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Barral</surname>
              <given-names>JK</given-names>
            </name>
            <name name-style="western">
              <surname>Webster</surname>
              <given-names>DR</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Matias</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Azizi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Karthikesalingam</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Natarajan</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Toward expert-level medical question answering with large language models</article-title>
          <source>Nat Med</source>
          <year>2025</year>
          <pub-id pub-id-type="doi">10.1038/s41591-024-03423-7</pub-id>
          <pub-id pub-id-type="medline">39779926</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-024-03423-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="web">
          <article-title>HIPAA for professionals</article-title>
          <source>US Department of Health and Human Services</source>
          <access-date>2024-11-25</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.hhs.gov/hipaa/for-professionals/index.html">https://www.hhs.gov/hipaa/for-professionals/index.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="web">
          <article-title>The general data protection regulation</article-title>
          <source>European Council and Council of the European Union</source>
          <access-date>2024-11-25</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.consilium.europa.eu/en/policies/data-protection/data-protection-regulation/">https://www.consilium.europa.eu/en/policies/data-protection/data-protection-regulation/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="web">
          <article-title>Chat Markup Language ChatML (Preview)</article-title>
          <source>Microsoft</source>
          <access-date>2024-11-25</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/chat-markup-language">https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/chat-markup-language</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hendrycks</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Burns</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Basart</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mazeika</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Steinhardt</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Measuring massive multitask language understanding</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on September 7, 2020</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2009.03300v3"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Dhingra</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>PubMedQA: a dataset for biomedical research question answering</article-title>
          <year>2019</year>
          <conf-name>Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)</conf-name>
          <conf-date>November 3-7, 2019</conf-date>
          <conf-loc>Hong Kong, China</conf-loc>
          <fpage>2567</fpage>
          <lpage>2577</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/d19-1259</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pal</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Umapathi</surname>
              <given-names>LK</given-names>
            </name>
            <name name-style="western">
              <surname>Sankarasubbu</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>MedMCQA: A large-scale multi-subject multi-choice dataset for medical domain question answering</article-title>
          <source>Proc Conf Health Inference Learn</source>
          <year>2022</year>
          <fpage>248</fpage>
          <lpage>260</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://proceedings.mlr.press/v174/pal22a.html"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Pan</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Oufattole</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Fang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Szolovits</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>What disease does this patient have? A large-scale open domain question answering dataset from medical exams</article-title>
          <source>Appl Sci</source>
          <year>2021</year>
          <volume>11</volume>
          <issue>14</issue>
          <fpage>6421</fpage>
          <pub-id pub-id-type="doi">10.3390/app11146421</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>AEW</given-names>
            </name>
            <name name-style="western">
              <surname>Bulgarelli</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gayles</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shammout</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Horng</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Pollard</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Hao</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Moody</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Gow</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Lehman</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>RG</given-names>
            </name>
          </person-group>
          <article-title>MIMIC-IV, a freely accessible electronic health record dataset</article-title>
          <source>Sci Data</source>
          <year>2023</year>
          <volume>10</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41597-022-01899-x"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41597-022-01899-x</pub-id>
          <pub-id pub-id-type="medline">36596836</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41597-022-01899-x</pub-id>
          <pub-id pub-id-type="pmcid">PMC9810617</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schulhoff</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ilie</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Balepur</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Kahadze</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Si</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Schulhoff</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dulepet</surname>
              <given-names>PS</given-names>
            </name>
            <name name-style="western">
              <surname>Vidyadhara</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ki</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Agrawal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Pham</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Kroiz</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Tao</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>The Prompt Report: a systematic survey of prompting techniques</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on June 6, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2406.06608"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>D</given-names>
            </name>
            <collab>Google DeepMind</collab>
          </person-group>
          <article-title>Chain-of-thought reasoning without prompting</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on February 15, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2402.10200v2"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Schuurmans</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bosma</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ichter</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Xia</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Le</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Chain-of-thought prompting elicits reasoning in large language models</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on January 28, 2022</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2201.11903v6"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>Anthropic</collab>
          </person-group>
          <article-title>The Claude 3 Model Family: Opus, Sonnet, Haiku</article-title>
          <source>Anthropic</source>
          <year>2024</year>
          <access-date>2025-01-26</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www-cdn.anthropic.com/de8ba9b01c9ab7cbabf5c33b80b7bbc618857627/Model_Card_Claude_3.pdf">https://www-cdn.anthropic.com/de8ba9b01c9ab7cbabf5c33b80b7bbc618857627/Model_Card_Claude_3.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>OpenAI</collab>
            <name name-style="western">
              <surname>Achiam</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Adler</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ahmad</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Akkaya</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Aleman</surname>
              <given-names>FL</given-names>
            </name>
            <name name-style="western">
              <surname>Almeida</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Altenschmidt</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Altman</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Anadkat</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Avila</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Babuschkin</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Balaji</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Balcom</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Baltescu</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bao</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Bavarian</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Belgum</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bello</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Berdine</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bernadett-Shapiro</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Berner</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Bogdonoff</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Boiko</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Boyd</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Brakman</surname>
              <given-names>AL</given-names>
            </name>
            <name name-style="western">
              <surname>Brockman</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Brooks</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Brundage</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Button</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Campbell</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Cann</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>GPT-4 technical report</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on March 15, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2303.08774v6"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Piantella</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Reali</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Kumar</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Tanca</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>A minimum metadataset for data lakes supporting healthcare research</article-title>
          <year>2024</year>
          <month>06</month>
          <day>23</day>
          <conf-name>SEBD 2024: 32nd Symposium on Advanced Database Systems</conf-name>
          <conf-date>June 23-26, 2024</conf-date>
          <conf-loc>Villasimius, Sardinia, Italy</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://sebd2024.unica.it/papers/paper47.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>AlSaad</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Abd-Alrazaq</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Boughorbel</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ahmed</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Renault</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Damseh</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sheikh</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Multimodal large language models in health care: applications, challenges, and future outlook</article-title>
          <source>J Med Internet Res</source>
          <year>2024</year>
          <volume>26</volume>
          <fpage>e59505</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2024/1/e59505/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/59505</pub-id>
          <pub-id pub-id-type="medline">39321458</pub-id>
          <pub-id pub-id-type="pii">v26i1e59505</pub-id>
          <pub-id pub-id-type="pmcid">PMC11464944</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Jain</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kankanhalli</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Hallucination is inevitable: an innate limitation of large language models</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on January 22, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2401.11817v1"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bommasani</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Holistic evaluation of language models</article-title>
          <source>Ann N Y Acad Sci</source>
          <year>2023</year>
          <volume>1525</volume>
          <issue>1</issue>
          <fpage>140</fpage>
          <lpage>146</lpage>
          <pub-id pub-id-type="doi">10.1111/nyas.15007</pub-id>
          <pub-id pub-id-type="medline">37230490</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ong</surname>
              <given-names>JCL</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>SYH</given-names>
            </name>
            <name name-style="western">
              <surname>William</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Butte</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>NH</given-names>
            </name>
            <name name-style="western">
              <surname>Chew</surname>
              <given-names>LST</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Doshi-Velez</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Savulescu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ting</surname>
              <given-names>DSW</given-names>
            </name>
          </person-group>
          <article-title>Ethical and regulatory challenges of large language models in medicine</article-title>
          <source>Lancet Digit Health</source>
          <year>2024</year>
          <volume>6</volume>
          <issue>6</issue>
          <fpage>e428</fpage>
          <lpage>e432</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2589-7500(24)00061-X"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/S2589-7500(24)00061-X</pub-id>
          <pub-id pub-id-type="medline">38658283</pub-id>
          <pub-id pub-id-type="pii">S2589-7500(24)00061-X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="web">
          <article-title>Ethics and governance of artificial intelligence for health</article-title>
          <source>World Health Organization</source>
          <access-date>2021-06-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.who.int/publications/i/item/9789240029200">https://www.who.int/publications/i/item/9789240029200</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="web">
          <source>Ragas</source>
          <access-date>2025-01-26</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://docs.ragas.io/en/stable/">https://docs.ragas.io/en/stable/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="web">
          <source>DeepEval</source>
          <access-date>2025-01-26</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://docs.confident-ai.com/">https://docs.confident-ai.com/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>CM</given-names>
            </name>
          </person-group>
          <article-title>Origin and uses of primum non nocere—above all, do no harm!</article-title>
          <source>J Clin Pharmacol</source>
          <year>2005</year>
          <volume>45</volume>
          <issue>4</issue>
          <fpage>371</fpage>
          <lpage>377</lpage>
          <pub-id pub-id-type="doi">10.1177/0091270004273680</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Sheng</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>AI hallucination: towards a comprehensive classification of distorted information in artificial intelligence-generated content</article-title>
          <source>Humanit Soc Sci Commun</source>
          <year>2024</year>
          <volume>11</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>14</lpage>
          <pub-id pub-id-type="doi">10.1057/s41599-024-03811-x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Maleki</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Padmanabhan</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Dutta</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>AI hallucinations: a misnomer worth clarifying</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on January 9, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2401.06796"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/cai59869.2024.00033</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hatem</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Simmons</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Thornton</surname>
              <given-names>JE</given-names>
            </name>
          </person-group>
          <article-title>A call to address AI 'hallucinations' and how healthcare professionals can mitigate their risks</article-title>
          <source>Cureus</source>
          <year>2023</year>
          <volume>15</volume>
          <issue>9</issue>
          <fpage>e44720</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37809168"/>
          </comment>
          <pub-id pub-id-type="doi">10.7759/cureus.44720</pub-id>
          <pub-id pub-id-type="medline">37809168</pub-id>
          <pub-id pub-id-type="pmcid">PMC10552880</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chauhan</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bongo</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Pedersen</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Ethical challenges of using synthetic data</article-title>
          <year>2023</year>
          <conf-name>Proceedings of the AAAI Symposium Series</conf-name>
          <conf-date>July 17–19, 2023</conf-date>
          <conf-loc>Washington, DC</conf-loc>
          <fpage>133</fpage>
          <lpage>134</lpage>
          <pub-id pub-id-type="doi">10.1609/aaaiss.v1i1.27490</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Adane</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gizachew</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kendie</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>The role of medical data in efficient patient care delivery: a review</article-title>
          <source>Risk Manag Healthc Policy</source>
          <year>2019</year>
          <volume>12</volume>
          <fpage>67</fpage>
          <lpage>73</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.tandfonline.com/doi/abs/10.2147/RMHP.S179259?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub%20%200pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.2147/RMHP.S179259</pub-id>
          <pub-id pub-id-type="medline">31114410</pub-id>
          <pub-id pub-id-type="pii">rmhp-12-067</pub-id>
          <pub-id pub-id-type="pmcid">PMC6486797</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Mo</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>A comprehensive survey of small language models in the era of large language models: techniques, enhancements, applications, collaboration with LLMs, and trustworthiness</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on November 4, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2411.03350"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chute</surname>
              <given-names>CG</given-names>
            </name>
          </person-group>
          <article-title>Clinical classification and terminology: some history and current observations</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2000</year>
          <volume>7</volume>
          <issue>3</issue>
          <fpage>298</fpage>
          <lpage>303</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/10833167"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/jamia.2000.0070298</pub-id>
          <pub-id pub-id-type="medline">10833167</pub-id>
          <pub-id pub-id-type="pmcid">PMC61433</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Buttazzo</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Rise of artificial general intelligence: risks and opportunities</article-title>
          <source>Front Artif Intell</source>
          <year>2023</year>
          <volume>6</volume>
          <fpage>1226990</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37693010"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/frai.2023.1226990</pub-id>
          <pub-id pub-id-type="medline">37693010</pub-id>
          <pub-id pub-id-type="pmcid">PMC10485377</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fei</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Huo</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Xiang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wen</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Towards artificial general intelligence via a multimodal foundation model</article-title>
          <source>Nat Commun</source>
          <year>2022</year>
          <volume>13</volume>
          <issue>1</issue>
          <fpage>3094</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41467-022-30761-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41467-022-30761-2</pub-id>
          <pub-id pub-id-type="medline">35655064</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41467-022-30761-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC9163040</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>van Rooij</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Guest</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Adolfi</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>de Haan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kolokolova</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Rich</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Reclaiming AI as a theoretical tool for cognitive science</article-title>
          <source>Comput Brain Behav</source>
          <year>2024</year>
          <volume>7</volume>
          <issue>4</issue>
          <fpage>616</fpage>
          <lpage>636</lpage>
          <pub-id pub-id-type="doi">10.1007/s42113-024-00217-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fjelland</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Why general artificial intelligence will not be realized</article-title>
          <source>Humanit Soc Sci Commun</source>
          <year>2020</year>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1057/s41599-020-0494-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhong</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Pan</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Lyu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Shu</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dai</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Ruan</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ren</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Evaluation of OpenAI o1: opportunities and challenges of AGI</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on September 27, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2409.18486v1"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schwartz</surname>
              <given-names>DA</given-names>
            </name>
          </person-group>
          <article-title>Physician-scientists: the bridge between medicine and science</article-title>
          <source>Am J Respir Crit Care Med</source>
          <year>2012</year>
          <volume>185</volume>
          <issue>6</issue>
          <fpage>595</fpage>
          <lpage>596</lpage>
          <pub-id pub-id-type="doi">10.1164/rccm.201110-1806ED</pub-id>
          <pub-id pub-id-type="medline">22422900</pub-id>
          <pub-id pub-id-type="pii">185/6/595</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
