<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v27i1e76048</article-id><article-id pub-id-type="doi">10.2196/76048</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Fine-Tuning Methods for Large Language Models in Clinical Medicine by Supervised Fine-Tuning and Direct Preference Optimization: Comparative Evaluation</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Savage</surname><given-names>Thomas</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>P Ma</surname><given-names>Stephen</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Boukil</surname><given-names>Abdessalem</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Rangan</surname><given-names>Ekanath</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Patel</surname><given-names>Vishwesh</given-names></name><degrees>MBBS</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lopez</surname><given-names>Ivan</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chen</surname><given-names>Jonathan</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff6">6</xref><xref ref-type="aff" rid="aff7">7</xref><xref ref-type="aff" rid="aff8">8</xref></contrib></contrib-group><aff id="aff1"><institution>Division of Hospital Medicine, Perelman School of Medicine, Department of Medicine, University of Pennsylvania</institution><addr-line>3400 Spruce St</addr-line><addr-line>Philadelphia</addr-line><addr-line>PA</addr-line><country>United States</country></aff><aff id="aff2"><institution>Division of Hospital Medicine, Department of Medicine, Stanford Medicine</institution><addr-line>Palo Alto</addr-line><addr-line>CA</addr-line><country>United States</country></aff><aff id="aff3"><institution>Linguamind AI</institution><addr-line>Sousse</addr-line><country>Tunisia</country></aff><aff id="aff4"><institution>Department of Medicine, Stanford Medicine</institution><addr-line>Palo Alto</addr-line><addr-line>CA</addr-line><country>United States</country></aff><aff id="aff5"><institution>Department of Medicine, Saint Michael&#x2019;s Medical Center</institution><addr-line>Newark</addr-line><addr-line>New Jersey</addr-line><country>United States</country></aff><aff id="aff6"><institution>Center for Biomedical Informatics Research, Stanford 
University</institution><addr-line>Palo Alto</addr-line><addr-line>CA</addr-line><country>United States</country></aff><aff id="aff7"><institution>Stanford Center for Biomedical Informatics Research</institution><addr-line>Palo Alto</addr-line><addr-line>CA</addr-line><country>United States</country></aff><aff id="aff8"><institution>Clinical Excellence Research Center, Stanford University</institution><addr-line>Palo Alto</addr-line><addr-line>CA</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Immanuvel Arockiasamy</surname><given-names>Jesu Marcus</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Potla</surname><given-names>Ravi Teja</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Thomas Savage, MD, Division of Hospital Medicine, Perelman School of Medicine, Department of Medicine, University of Pennsylvania, 3400 Spruce St, Philadelphia, PA, 19147, United States, 1 2155191670; <email>thomas.savage@pennmedicine.upenn.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>23</day><month>9</month><year>2025</year></pub-date><volume>27</volume><elocation-id>e76048</elocation-id><history><date date-type="received"><day>15</day><month>04</month><year>2025</year></date><date date-type="rev-recd"><day>29</day><month>07</month><year>2025</year></date><date date-type="accepted"><day>31</day><month>07</month><year>2025</year></date></history><copyright-statement>&#x00A9; Thomas Savage, Stephen P Ma, Abdessalem Boukil, Ekanath Rangan, Vishwesh Patel, Ivan Lopez, Jonathan Chen. 
Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 23.9.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2025/1/e76048"/><abstract><sec><title>Background</title><p>Large language model (LLM) fine-tuning is the process of adjusting out-of-the-box model weights using a dataset of interest. Fine-tuning can be a powerful technique to improve model performance in fields like medicine, where LLMs may have poor out-of-the-box performance. 
The 2 common fine-tuning techniques are supervised fine-tuning (SFT) and direct preference optimization (DPO); however, little guidance is available for when to apply either method within clinical medicine or health care operations.</p></sec><sec><title>Objective</title><p>This study aims to investigate the benefits of fine-tuning with SFT and DPO across a range of core natural language tasks in medicine to better inform clinical informaticists when either technique should be deployed.</p></sec><sec sec-type="methods"><title>Methods</title><p>We use Llama3 8B (Meta) and Mistral 7B v2 (Mistral AI) to compare the performance of SFT alone and DPO across 4 common natural language tasks in medicine. The tasks we evaluate include text classification, clinical reasoning, text summarization, and clinical triage.</p></sec><sec sec-type="results"><title>Results</title><p>Our results found clinical reasoning accuracy increased from 7% and 22% with base Llama3 and Mistral2, respectively, to 28% and 33% with SFT, and then 36% and 40% with DPO (<italic>P</italic>=.003 and <italic>P</italic>=.004, respectively). Summarization quality, graded on a 5-point Likert scale, was 4.11 with base Llama3 and 3.93 with base Mistral2. Performance increased to 4.21 and 3.98 with SFT and then 4.34 and 4.08 with DPO (<italic>P</italic>&#x003C;.001). <italic>F</italic><sub>1</sub>-scores for provider triage were 0.55 for Llama3 and 0.49 for Mistral2, which increased to 0.58 and 0.52 with SFT and 0.74 and 0.66 with DPO (<italic>P</italic>&#x003C;.001). <italic>F</italic><sub>1</sub>-scores for urgency triage were 0.81 for Llama3 and 0.88 for Mistral2, which decreased with SFT to 0.79 and 0.87, and then experienced mixed results with DPO, achieving 0.91 and 0.85, respectively (<italic>P</italic>&#x003C;.001 and <italic>P</italic>&#x003E;.99, respectively). 
Finally, <italic>F</italic><sub>1</sub>-scores for text classification were 0.63 for Llama3 and 0.73 for Mistral2, which increased to 0.98 and 0.97 with SFT, and then essentially did not change with DPO to 0.95 and 0.97, respectively (<italic>P</italic>=.55 and <italic>P</italic>&#x003E;.99, respectively). DPO fine-tuning required approximately 2 to 4 times more compute resources than SFT alone.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>SFT alone is sufficient for simple tasks such as rule-based text classification, while DPO after SFT improves performance on the more complex tasks of triage, clinical reasoning, and summarization. We postulate that SFT alone is sufficient for simple tasks because SFT strengthens simple word-association reasoning, whereas DPO enables deeper comprehension because it is trained with both positive and negative examples, enabling the model to recognize more complex patterns. Ultimately, our results help inform clinical informaticists when to deploy either fine-tuning method and encourage commercial LLM providers to offer DPO fine-tuning for commonly used proprietary LLMs in medicine.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>direct preference optimization</kwd><kwd>supervised fine-tuning</kwd><kwd>fine-tuning</kwd><kwd>large language models</kwd></kwd-group></article-meta></front><body><sec id="s2" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Overview</title><p>Large language models (LLMs) have sparked considerable interest in the medical field, offering potential for transformative clinical and operational applications [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. However, to be effectively deployed in health care settings, these models often require additional refinement. 
While prompt engineering is a commonly used strategy for tailoring model behavior [<xref ref-type="bibr" rid="ref4">4</xref>], it is not sufficient for all tasks. In cases where prompt engineering falls short, fine-tuning provides a more robust approach to adapt LLMs to specific medical use cases.</p><p>Fine-tuning is the process of adjusting the coefficient weights of a language model after pretraining, adapting the model with a subject-specific dataset of interest to the user [<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. To date, few LLM applications in medicine have deployed fine-tuning. In turn, there is a scarcity of literature informing users about which natural language processing (NLP) tasks benefit from LLM fine-tuning and, for those that benefit, which specific fine-tuning methods should be deployed. Therefore, in this study, we quantify the benefits of 2 common fine-tuning techniques, supervised fine tuning (SFT) and direct preference optimization (DPO), across a few key elementary tasks in clinical NLP.</p></sec><sec id="s1-2"><title>Background</title><p>SFT has been the conventional method of fine-tuning a language model. SFT requires the user to provide example prompts and desirable reference responses. SFT uses a classic loss function to adjust model weights and maximize the probability that the model will reproduce similar gold standard responses [<xref ref-type="bibr" rid="ref9">9</xref>]. In many ways, SFT is simply training the model to mimic reference responses.</p><p>DPO is a variation of reinforcement learning that has become a popular fine-tuning technique because of its stability when training with smaller datasets [<xref ref-type="bibr" rid="ref10">10</xref>]. In contrast to SFT, DPO requires the user to provide not only prompts and gold standard responses but also &#x201C;rejected&#x201D; (meaning less preferred) responses that the user finds undesirable. 
The use of rejected responses for fine-tuning is the key difference between SFT and DPO because DPO adjusts model weights to both maximize the likelihood of desired responses and minimize the likelihood of less preferred &#x201C;rejected&#x201D; responses. This conceptual difference is reflected in the DPO loss function (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. DPO is typically used on a model that has already undergone SFT fine-tuning.</p><p>When to use DPO is an area of active investigation. DPO is described as providing better alignment with human preferences, but recent publications have highlighted the ambiguity of this description [<xref ref-type="bibr" rid="ref9">9</xref>]. It is unknown whether better alignment translates to better reasoning, summarization, information retrieval, or other tasks of importance to clinicians. Overall, few studies have compared SFT with DPO for individual NLP tasks important to medicine [<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>To address these gaps, our study aims to test key clinical NLP tasks for benefit from SFT and DPO fine-tuning. Specifically, we evaluate simple classification, clinical reasoning, text summarization, and clinical triage&#x2014;areas where enhanced language model capabilities could meaningfully support medical decision-making.</p></sec></sec><sec id="s3" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>We compared SFT and DPO on 4 datasets, each evaluating a core clinical NLP task. A glossary of terms is provided in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. 
We performed our investigation on 2 popular open-source LLMs, Llama3-8B-Instruct [<xref ref-type="bibr" rid="ref11">11</xref>] and Mistral-Instruct-v2 [<xref ref-type="bibr" rid="ref12">12</xref>], using datasets of fewer than 5000 training examples.</p><p>Each dataset consisted of a training, evaluation, development, and test set. The base LLM model was first fine-tuned via SFT using the training and evaluation datasets, and then the development dataset was used to select the top-performing SFT model. The top-performing SFT model was then used as the base model for DPO fine-tuning. DPO was then performed using the train and evaluation datasets, and the top-performing DPO model was selected using the development set. Finally, the base LLM, top-performing SFT model, and top-performing DPO model were compared using the test set. This evaluation process is illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overview of the methods used to fine-tune the SFT and DPO models, as well as compare the fine-tuned models with the base large language model. DPO: direct preference optimization; LLM: large language model; SFT: supervised fine-tuning.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e76048_fig01.png"/></fig></sec><sec id="s2-2"><title>Elementary Tasks Evaluated</title><p>The 4 elemental NLP tasks of interest were selected for evaluation from the systematic review by Bedi et al [<xref ref-type="bibr" rid="ref2">2</xref>]: simple classification, clinical reasoning, text summarization, and triage. Bedi et al [<xref ref-type="bibr" rid="ref2">2</xref>] completed a review of 519 studies that used LLMs for medical applications and grouped them by overall task to identify how LLMs are used in clinical practice. 
From that list of tasks compiled by Bedi, we selected the tasks most likely to benefit from fine-tuning for inclusion in our study.</p><p>These 4 tasks reflect key functions that clinicians frequently perform in real-world settings. Simple classification is used to categorize clinical notes for purposes such as billing, quality reporting, or operational workflows [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. Clinical reasoning tasks require the model to interpret clinical information&#x2014;such as patient histories or provider notes&#x2014;and generate diagnostic assessments or treatment recommendations [<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref16">16</xref>]. Summarization helps clinicians condense lengthy documentation into concise, high-yield summaries to support faster chart review [<xref ref-type="bibr" rid="ref17">17</xref>]. Finally, triage tasks apply abstract, nonexplicit criteria to determine case prioritization, such as identifying patients who need urgent evaluation or allocating limited resources in emergency or ambulatory care settings [<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>Below we describe our methods used to evaluate each task. <xref ref-type="table" rid="table1">Table 1</xref> provides additional details on the dataset used for each task.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Description of the NLP tasks evaluated and the corresponding dataset, gold standard answer, and rejected answer. The same datasets and preferred samples were used for both SFT and DPO. 
All datasets (except for patient message triage) are provided in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendices 3</xref><xref ref-type="supplementary-material" rid="app4"/><xref ref-type="supplementary-material" rid="app5"/><xref ref-type="supplementary-material" rid="app6"/>-<xref ref-type="supplementary-material" rid="app7">7</xref>.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Tasks</td><td align="left" valign="bottom">Description</td><td align="left" valign="bottom">Clinical scenario tested</td><td align="left" valign="bottom">Dataset</td><td align="left" valign="bottom">Preferred sample</td><td align="left" valign="bottom">Rejected sample</td></tr></thead><tbody><tr><td align="left" valign="top">Simple classification</td><td align="left" valign="top">Recognize a strict text-based criterion to classify a passage into one of multiple groups.</td><td align="left" valign="top">Identify passages describing patients with a UTI<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> (pyuria with lower urinary tract symptoms) versus only pyuria.</td><td align="left" valign="top">Total dataset size: 700<break/>patient scenarios were generated by GPT-4 [<xref ref-type="bibr" rid="ref19">19</xref>] and then edited by 3 board-certified physicians for accuracy and to provide sufficient data variability.</td><td align="left" valign="top">Diagnosis by a board-certified physician.</td><td align="left" valign="top">Incorrect diagnosis not selected by grading physician.</td></tr><tr><td align="left" valign="top">Clinical triage</td><td align="left" valign="top">Recognize an abstract criterion to classify a passage into one of multiple groups.</td><td align="left" valign="top">Triage patient messages for both the appropriate urgency of response (urgent or nonurgent) and appropriate responding provider (physician or medical assistant).</td><td align="left" valign="top">Total dataset size: 
1800 outpatient clinic patient messages from Stanford Health Care triaged by physician author TRS according to criteria listed in <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>.</td><td align="left" valign="top">Appropriate triage as determined by the grading physician (author TRS).</td><td align="left" valign="top">Incorrect triage not selected by the grading physician.</td></tr><tr><td align="left" valign="top">Clinical reasoning</td><td align="left" valign="top">Interpret patient information to identify diagnoses and select treatments.</td><td align="left" valign="top">Medical board exam questions evaluating the skills of clinical diagnosis and treatment selection.</td><td align="left" valign="top">Total dataset size: 5161<break/>MedQA dataset [<xref ref-type="bibr" rid="ref20">20</xref>], modified to questions evaluating clinical diagnosis and treatment selection at the step 2 and 3 level [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>].</td><td align="left" valign="top">Correct answer provided by the MedQA dataset.</td><td align="left" valign="top">Randomly selected incorrect multiple-choice option provided by the MedQA dataset.</td></tr><tr><td align="left" valign="top">Summarization</td><td align="left" valign="top">Identify key information in a passage for a target audience.</td><td align="left" valign="top">Summarize a discharge summary note into 2&#x2010;3 sentences for an internal medicine physician.</td><td align="left" valign="top">Total dataset size: 5250 synthetic discharge notes from the AISC Augmented Clinical Notes dataset [<xref ref-type="bibr" rid="ref23">23</xref>].</td><td align="left" valign="top">GPT-4 [<xref ref-type="bibr" rid="ref19">19</xref>]&#x2013;generated summaries.</td><td align="left" valign="top">Llama2 [<xref ref-type="bibr" rid="ref24">24</xref>]&#x2013;generated summaries.</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>UTI: 
urinary tract infection</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-3"><title>Simple Classification</title><p>The first elementary task evaluated was simple classification, where we asked models to identify passages describing patients with a possible urinary tract infection (UTI). To be classified as a UTI, the passage needed to describe both pyuria and lower urinary tract symptoms [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>].</p><p>The dataset was generated by GPT-4, which was prompted to generate 400 cases describing pyuria with no symptoms and 400 cases describing pyuria with urinary symptoms (positive for UTI). The 3 physician annotators then reviewed the generated cases to ensure correctness and introduce sufficient variability among the examples. The 800 examples were then split into a training set (300 examples), evaluation set (200 examples), development set (100 examples), and test set (200 examples). Prompts, patient descriptions, and model responses with grades are provided in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p></sec><sec id="s2-4"><title>Clinical Reasoning</title><p>The second elementary task evaluated was clinical reasoning. Clinical reasoning was evaluated using a modified MedQA dataset, where the original MedQA questions were adapted to be open-ended and included only step 2 and 3 level board exam questions (assessments that focus on higher levels of clinical reasoning).</p><p>The modified MedQA dataset consisted of 4095 training examples, 456 evaluation examples, 200 development examples, and 410 test questions. Reference answers were identified as the original MedQA answer, and rejected answers (used for DPO fine-tuning) were randomly selected from the list of incorrect multiple-choice options from the original dataset.</p><p>Each open-ended question was graded by at least 2 physician annotators. 
A question was marked correct if the answer provided was equivalent or equally correct to the gold standard answer provided by the MedQA answer key. If there was disagreement over the grade given by the first 2 physician annotators, the third annotator determined the final grade. The full data, along with the graded model responses, can be found in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>.</p></sec><sec id="s2-5"><title>Summarization</title><p>The third elementary task evaluated was summarization, where the models were asked to summarize discharge summaries into 2&#x2010;3 sentences. Synthetic discharge summary notes were taken from the AISC Augmented Clinical Notes dataset [<xref ref-type="bibr" rid="ref23">23</xref>]. Gold standard summaries were generated by GPT-4 (gpt-4&#x2010;0613) [<xref ref-type="bibr" rid="ref19">19</xref>], and rejected examples for DPO fine-tuning were generated by the Llama2-chat-7B model (<xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>) [<xref ref-type="bibr" rid="ref27">27</xref>].</p><p>The dataset consisted of 4500 training examples, 300 evaluation examples, 150 development examples, and 300 test examples. LLM summaries were judged by GPT-4 (leveraging a state-of-the-art model as a judge is common practice within computer science [<xref ref-type="bibr" rid="ref28">28</xref>-<xref ref-type="bibr" rid="ref30">30</xref>]) on a 5-point Likert scale, with 5 being the best possible score. The full data along with the model grades can be found in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>.</p></sec><sec id="s2-6"><title>Triage</title><p>The final elementary task evaluated was triage, where the model was asked to triage patient messages for appropriate urgency (urgent vs nonurgent) and the appropriate responding provider (medical assistant vs physician). 
Patient messages were sourced from Stanford Clinics and graded by author TRS using the criteria provided in <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>.</p><p>A total of 2400 messages were graded. Messages that were ambiguous or did not require a response were not included in our investigation. The final dataset consisted of 1300 training examples, 200 evaluation examples, 100 development examples, and 200 test examples.</p></sec><sec id="s2-7"><title>Fine-Tuning Hyperparameters</title><p>Hyperparameters were tested with a sweep across a range, and the optimal settings were determined by testing on the development set. The learning rates tested were 10<sup>&#x2212;5</sup>, 10<sup>&#x2212;6</sup>, 10<sup>&#x2212;7</sup>, and 10<sup>&#x2212;8</sup>. The beta values tested were 0.1, 0.3, and 0.5.</p><p>Each model&#x2013;hyperparameter configuration was initially tested with 1000 steps. The validation error plot was then analyzed to identify where the validation error plateaued, and the model was trained a second time with that step count.</p><p>All models produced by this investigation (with the exception of patient message triage) are available at the huggingface account <italic>tsavage68</italic>. Training was completed with the following python libraries: Transformers 4.44.2, Pytorch 2.4.0, Datasets 2.21.0, and Tokenizers 0.19.1.</p></sec><sec id="s2-8"><title>Statistical Evaluation</title><p>McNemar test was used for the statistical evaluation of tasks with binary outcomes (classification with text data, clinical reasoning, and triage). A 2-tailed paired t test was used for the statistical evaluation of tasks with ordinal outcomes (summarization). 
An &#x03B1; of .05 was used as our statistical significance threshold; however, accounting for 5 total tasks by the Bonferroni correction [<xref ref-type="bibr" rid="ref31">31</xref>], we used a <italic>P</italic> value threshold of .01.</p></sec><sec id="s2-9"><title>Ethical Considerations</title><p>Patient messages were sourced from Stanford Health Care outpatient clinics under Stanford University Institutional Review Board Protocols 47618 and 76483, which approved the use of these data for research and quality improvement purposes. All data were deidentified to ensure patient confidentiality. Investigations with patient message data were performed on a Health Insurance Portability and Accountability Act&#x2013;secure Google Cloud Platform account through Stanford University, and resulting models are not shared publicly.</p></sec></sec><sec id="s4" sec-type="results"><title>Results</title><sec id="s3-1"><title>Simple Classification</title><p>In the classification with text data task, we found base Llama3 and Mistral2 achieved <italic>F</italic><sub>1</sub>-scores of 0.63 and 0.73, respectively, when identifying passages describing patients with a UTI. With SFT, Llama3&#x2019;s <italic>F</italic><sub>1</sub>-score increased to 0.98 (<italic>P</italic>&#x003C;.001), whereas Mistral2 increased to 0.97 (<italic>P</italic>&#x003C;.001). With DPO fine-tuning, Llama3&#x2019;s <italic>F</italic><sub>1</sub>-score decreased to 0.95 (<italic>P</italic>=.55 compared to SFT), and Mistral2&#x2019;s <italic>F</italic><sub>1</sub>-score remained 0.97 (<italic>P&#x003E;</italic>.99 compared to SFT). Results are provided in <xref ref-type="fig" rid="figure2">Figure 2A</xref>.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Comparison of base Llama3 and Mistral2 (gray) against SFT (blue) and DPO (red) fine-tuned variants for the tasks of (A) simple classification, (B) clinical reasoning, (C) summarization, and (D-E) triage. 
<italic>P</italic> values comparing model variants are provided to the right of each bar graph. Statistically significant <italic>P</italic> values are bolded with an asterisk. A <italic>P</italic> value of .01 was used to account for 5 total tasks by the Bonferroni correction. A definition of <italic>F</italic><sub>1</sub>-score is provided in our glossary of terms. DPO: direct preference optimization; SFT: supervised fine-tuning.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e76048_fig02.png"/></fig></sec><sec id="s3-2"><title>Clinical Reasoning</title><p>In the clinical reasoning task, Llama3 and Mistral achieved accuracies of 7% and 22% respectively on a modified MedQA dataset. With SFT, the model accuracies increased to 28% (<italic>P</italic>&#x003C;.001) and 33% (<italic>P</italic>&#x003C;.001), respectively. With DPO, the model accuracies increased even further to 36% (<italic>P</italic>=.003) for Llama3 and 40% (<italic>P</italic>=.004) for Mistral2. The results are illustrated in <xref ref-type="fig" rid="figure2">Figure 2B</xref>. There was 97.2% agreement between the 2 grading physicians, and a third tie-breaking physician was only needed in 2.8% of questions.</p></sec><sec id="s3-3"><title>Clinical Summarization</title><p>In the clinical summarization task, Llama3 achieved an average five-point Likert scale rating of 4.11, and Mistral achieved a rating of 3.93, with 5 being the highest score and one the lowest. With SFT, ratings improved to 4.21 (<italic>P</italic>=.005) for Llama3 and 3.98 (<italic>P</italic>=.04) for Mistral2. With DPO, ratings further improved to 4.34 (<italic>P</italic>&#x003C;.001) for Llama3 and 4.08 (<italic>P</italic>&#x003C;.001) for Mistral2. 
The results are shown in <xref ref-type="fig" rid="figure2">Figure 2C</xref>.</p></sec><sec id="s3-4"><title>Clinical Triage</title><p>In the triage task, we found base Llama3 achieved <italic>F</italic><sub>1</sub>-scores of 0.55 and 0.81 for personnel and urgency triage, respectively, whereas base Mistral2 achieved <italic>F</italic><sub>1</sub>-scores of 0.49 and 0.88. With SFT, Llama3&#x2019;s <italic>F</italic><sub>1</sub>-score increased to 0.58 (<italic>P</italic>=.15) for personnel triage, but its <italic>F</italic><sub>1</sub>-score decreased for urgency triage to 0.79 (<italic>P</italic>=.53). With SFT, Mistral2&#x2019;s personnel triage <italic>F</italic><sub>1</sub>-score increased to 0.52 (<italic>P</italic>&#x003E;.99), and the urgency triage <italic>F</italic><sub>1</sub>-score decreased to 0.87 (<italic>P</italic>=.05). With DPO, Llama3&#x2019;s personnel triage <italic>F</italic><sub>1</sub>-score increased to 0.74 (<italic>P</italic>&#x003C;.001), and the urgency triage <italic>F</italic><sub>1</sub>-score increased to 0.91 (<italic>P</italic>&#x003C;.001). With DPO, Mistral2&#x2019;s personnel triage <italic>F</italic><sub>1</sub>-score increased to 0.66 (<italic>P</italic>&#x003C;.001), but its urgency triage <italic>F</italic><sub>1</sub>-score did not benefit, decreasing to 0.85 (<italic>P</italic>&#x003E;.99). <xref ref-type="fig" rid="figure2">Figure 2D and E</xref> show <italic>F</italic><sub>1</sub>-score results. Sensitivity and specificity data are provided in <xref ref-type="supplementary-material" rid="app8">Multimedia Appendix 8</xref>.</p></sec><sec id="s3-5"><title>Training Dynamics</title><p>Investigations were completed with a single A100 graphics processing unit. Across all tasks, DPO training required approximately 2 to 4 times as many graphics processing unit-hours as SFT. 
For example, completing 1000 training steps with SFT for text classification required approximately 20 minutes of computational time, while DPO required 50 minutes. Similarly, 1000 steps of text summarization training required approximately 50 minutes with SFT and 160 minutes with DPO.</p></sec></sec><sec id="s5" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>The results of our investigation demonstrate how fine-tuning with SFT and DPO can improve performance on common clinical natural language tasks. We found that SFT alone was sufficient for text-based classification (<xref ref-type="fig" rid="figure2">Figure 2A</xref>), whereas performance on the more complex tasks of triage, clinical reasoning, and summarization significantly improved with DPO (<xref ref-type="fig" rid="figure2">Figure 2B, C, D, and E</xref>). This nuanced performance advantage with DPO after SFT is an important finding because as artificial intelligence workflows become more common in clinical practice, the use of DPO can translate to tangible benefits for patients and providers. Physicians may reduce their risks of diagnostic errors and find AI-generated summaries more useful, while patients could find their care more equitably and efficiently triaged and expedited.</p><p>We postulate that SFT alone is sufficient for simple classification but not for triage, clinical reasoning, or summarization because SFT strengthens simple &#x201C;word-association&#x201D; reasoning, whereas DPO enables more nuanced interpretation. Because SFT is trained on only desired reference responses, the model is conditioned to recognize high-yield words or basic concepts but not deeper comprehension. By comparison, DPO is trained with both positive and negative examples, and this contrast enables the model to recognize more complex patterns (mimicking better understanding). 
As a result, we observe that SFT alone is sufficient for classification tasks with clearly defined criteria, such as diagnosing a UTI, whereas DPO fine-tuning is better for classification tasks that have abstract criteria such as patient message triage, clinical reasoning, or summarization. It is important to note, however, that DPO requires approximately 2 to 4 times more computational resources than SFT alone. We conclude that while SFT is sufficient for simple tasks driven by word or entity association, DPO offers superior performance for tasks requiring recognition of more complex patterns&#x2014;albeit at a higher computational cost.</p></sec><sec id="s4-2"><title>Future Directions</title><p>Despite its promise, broader adoption of DPO remains limited by the current software infrastructure. Most leading commercial LLM providers&#x2014;including OpenAI, Google, and Anthropic&#x2014;do not offer DPO fine-tuning as part of their platforms [<xref ref-type="bibr" rid="ref32">32</xref>-<xref ref-type="bibr" rid="ref34">34</xref>]. This lack of support restricts the ability to optimize high-performing models such as GPT-4 (OpenAI), Gemini (Google DeepMind), and Claude-3 (Anthropic) for clinical tasks where alignment with clinician expectations is critical. To unlock the full potential of LLMs in medicine, it is essential for the informatics community and technology providers to collaborate on developing tools and workflows that support DPO fine-tuning for real-world clinical applications.</p></sec><sec id="s4-3"><title>Limitations</title><p>One limitation of our investigation is the reliance on synthetic training data. While synthetic data enables sharing of results and models without the ethical risk of exposing protected health information or having to use patient personal data to develop an AI product without their consent, it introduces bias and lacks the full diversity present in real-world prospective clinical data. 
As such, we encourage future studies to validate our findings using real-world datasets to ensure generalizability to real-world clinical applications.</p><p>A second limitation of our investigation is that we did not evaluate language models with more than ten billion parameters, although the trend in our results is expected to be consistent, even for larger models. Our exploration of moderately sized models provides valuable insight to guide investment in fine-tuning larger models that will be used in clinical operations or care.</p></sec><sec id="s4-4"><title>Comparison to Prior Work</title><p>A notable strength of our investigation is the use of datasets with fewer than 5000 training examples to reflect the data limitations of clinical medicine. Many existing publications on fine-tuning deploy training sets of more than 30,000 examples [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>], sizes that are unrealistic for a single hospital system or clinic to achieve. Therefore, our findings prove the feasibility of fine-tuning language models within the realistic data constraints of medicine.</p></sec><sec id="s4-5"><title>Conclusions</title><p>Fine-tuning with SFT alone is sufficient for simple classification tasks with well-defined criteria. In contrast, fine-tuning with DPO requires more computational resources, but better optimizes performance for complex tasks such as triage, clinical reasoning, and summarization.</p></sec></sec></body><back><ack><p>JC has received research funding support in part by the National Institutes of Health (NIH)/National Institute of Allergy and Infectious Diseases (1R01AI17812101), NIH-NCATS-Clinical &#x0026; Translational Science Award (UM1TR004921), Stanford Bio-X Interdisciplinary Initiatives Seed Grants Program (IIP; R12), NIH/Center for Undiagnosed Diseases at Stanford (U01 NS134358), Josiah Macy Jr. 
Foundation (AI in Medical Education), NIH/National Institute on Drug Abuse Clinical Trials Network (UG1DA015815&#x2014;CTN-0136), Gordon and Betty Moore Foundation (grant 12409), Stanford Artificial Intelligence in Medicine and Imaging&#x2014;Human-Centered Artificial Intelligence (AIMI-HAI) Partnership Grant, Google Inc Research collaboration, and American Heart Association&#x2014;Strategically Focused Research Network&#x2014;Diversity in Clinical Trials. Generative artificial intelligence (AI) was used to rephrase individual sentences for clarity and screen for spelling and grammar errors. Generative AI was also used to troubleshoot Python code errors. Generative AI was not used to design the study, draft the manuscript, or interpret results.</p></ack><fn-group><fn fn-type="con"><p>TS, SPM, IL, and JC were involved in manuscript writing, reviewing, and editing. TS, IL, and AB wrote all the code used in this manuscript. TS, SPM, ER, and VP participated in model response grading. Data analysis was performed by TS. 
Funding for the project was secured by JC.</p></fn><fn fn-type="conflict"><p>JC is a co-founder of Reaction Explorer LLC that develops and licenses organic chemistry education software; was paid medical expert witness fees from Sutton Pierce, Younker Hyde MacFarlane, Sykes McAllister, and Elite Experts; was paid consulting fees from ISHI Health; and was paid honoraria or travel expenses for invited presentations by Insitro, General Reinsurance Corporation, Cozeva, and other industry conferences, academic institutions, and health systems.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">DPO</term><def><p>direct preference optimization</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb4">SFT</term><def><p>supervised fine-tuning</p></def></def-item><def-item><term id="abb5">UTI</term><def><p>urinary tract infection</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Savage</surname><given-names>T</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Shieh</surname><given-names>L</given-names> </name></person-group><article-title>A large language model screening tool to target patients for best practice alerts: development and validation</article-title><source>JMIR Med Inform</source><year>2023</year><month>11</month><day>27</day><volume>11</volume><issue>1</issue><fpage>e49886</fpage><pub-id pub-id-type="doi">10.2196/49886</pub-id><pub-id pub-id-type="medline">38010803</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="other"><person-group 
person-group-type="author"><name name-style="western"><surname>Bedi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Orr-Ewing</surname><given-names>L</given-names> </name><etal/></person-group><article-title>A systematic review of testing and evaluation of healthcare applications of large language models (LLMs)</article-title><source>medRxiv</source><comment>Preprint posted online on  Apr 16, 2024</comment><pub-id pub-id-type="doi">10.1101/2024.04.15.24305869</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Meng</surname><given-names>X</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>K</given-names> </name><etal/></person-group><article-title>The application of large language models in medicine: a scoping review</article-title><source>iScience</source><year>2024</year><month>05</month><day>17</day><volume>27</volume><issue>5</issue><fpage>109713</fpage><pub-id pub-id-type="doi">10.1016/j.isci.2024.109713</pub-id><pub-id pub-id-type="medline">38746668</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Prompt engineering for healthcare: methodologies and applications</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 28, 2023</comment><pub-id 
pub-id-type="doi">10.48550/arXiv.2304.14670</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Saeidi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Verma</surname><given-names>S</given-names> </name><name name-style="western"><surname>Baral</surname><given-names>C</given-names> </name></person-group><article-title>Insights into alignment: evaluating DPO and its variants across multiple tasks</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 23, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2404.14723</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Tunstall</surname><given-names>L</given-names> </name><name name-style="western"><surname>Beeching</surname><given-names>E</given-names> </name><name name-style="western"><surname>Lambert</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Zephyr: direct distillation of LM alignment</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 25, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2310.16944</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="web"><article-title>Intel/neural-chat-7b-v3-3</article-title><source>Hugging Face</source><access-date>2024-06-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/Intel/neural-chat-7b-v3-3">https://huggingface.co/Intel/neural-chat-7b-v3-3</ext-link></comment></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>Z</given-names> </name><name 
name-style="western"><surname>Cano</surname><given-names>AH</given-names> </name><name name-style="western"><surname>Romanou</surname><given-names>A</given-names> </name><etal/></person-group><article-title>MEDITRON-70B: scaling medical pretraining for large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Nov 27, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2311.16079</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Feng</surname><given-names>D</given-names> </name><name name-style="western"><surname>Qin</surname><given-names>B</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Lei</surname><given-names>W</given-names> </name></person-group><article-title>Towards analyzing and understanding the limitations of DPO: a theoretical perspective</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 6, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2404.04626</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Rafailov</surname><given-names>R</given-names> </name><name name-style="western"><surname>Sharma</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mitchell</surname><given-names>E</given-names> </name><name name-style="western"><surname>Ermon</surname><given-names>S</given-names> </name><name name-style="western"><surname>Manning</surname><given-names>CD</given-names> </name><name name-style="western"><surname>Finn</surname><given-names>C</given-names> </name></person-group><article-title>Direct preference optimization: your language 
model is secretly a reward model</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 13, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2305.18290</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="web"><article-title>Llama3/model_card.md at main &#x00B7; meta-llama/llama3</article-title><source>GitHub</source><access-date>2024-05-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/meta-llama/llama3/blob/main/MODEL_CARD.md">https://github.com/meta-llama/llama3/blob/main/MODEL_CARD.md</ext-link></comment></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="web"><article-title>Mistralai/mistral-7B-instruct-v0.2</article-title><source>Hugging Face</source><year>2024</year><access-date>2024-09-09</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2">https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2</ext-link></comment></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Soroush</surname><given-names>A</given-names> </name><name name-style="western"><surname>Glicksberg</surname><given-names>BS</given-names> </name><name name-style="western"><surname>Zimlichman</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Large language models are poor medical coders &#x2014; benchmarking of medical code querying</article-title><source>NEJM AI</source><year>2024</year><month>04</month><day>25</day><volume>1</volume><issue>5</issue><fpage>AIdbp2300040</fpage><pub-id pub-id-type="doi">10.1056/AIdbp2300040</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kanjee</surname><given-names>Z</given-names> 
</name><name name-style="western"><surname>Crowe</surname><given-names>B</given-names> </name><name name-style="western"><surname>Rodman</surname><given-names>A</given-names> </name></person-group><article-title>Accuracy of a generative artificial intelligence model in a complex diagnostic challenge</article-title><source>JAMA</source><year>2023</year><month>07</month><day>3</day><volume>330</volume><issue>1</issue><fpage>78</fpage><lpage>80</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.8288</pub-id><pub-id pub-id-type="medline">37318797</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>McDuff</surname><given-names>D</given-names> </name><name name-style="western"><surname>Schaekermann</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Towards accurate differential diagnosis with large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Nov 30, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2312.00164</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Savage</surname><given-names>T</given-names> </name><name name-style="western"><surname>Nayak</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gallo</surname><given-names>R</given-names> </name><name name-style="western"><surname>Rangan</surname><given-names>E</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>JH</given-names> </name></person-group><article-title>Diagnostic reasoning prompts reveal the potential for large language model interpretability in medicine</article-title><source>arXiv</source><comment>Preprint posted online on  Aug 13, 
2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2308.06834</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Van Veen</surname><given-names>D</given-names> </name><name name-style="western"><surname>Van Uden</surname><given-names>C</given-names> </name><name name-style="western"><surname>Blankemeier</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Clinical text summarization: adapting large language models can outperform human experts</article-title><source>Res Sq</source><year>2023</year><month>10</month><day>30</day><fpage>rs.3.rs-3483777</fpage><pub-id pub-id-type="doi">10.21203/rs.3.rs-3483777/v1</pub-id><pub-id pub-id-type="medline">37961377</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Friedman</surname><given-names>AB</given-names> </name><name name-style="western"><surname>Delgado</surname><given-names>MK</given-names> </name><name name-style="western"><surname>Weissman</surname><given-names>GE</given-names> </name></person-group><article-title>Artificial intelligence for emergency care triage-much promise, but still much to learn</article-title><source>JAMA Netw Open</source><year>2024</year><month>05</month><day>1</day><volume>7</volume><issue>5</issue><fpage>e248857</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.8857</pub-id><pub-id pub-id-type="medline">38713470</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="web"><article-title>GPT-4 system card</article-title><source>OpenAI</source><access-date>2023-12-25</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://cdn.openai.com/papers/gpt-4-system-card.pdf">https://cdn.openai.com/papers/gpt-4-system-card.pdf</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Jin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>E</given-names> </name><name name-style="western"><surname>Oufattole</surname><given-names>N</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>WH</given-names> </name><name name-style="western"><surname>Fang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Szolovits</surname><given-names>P</given-names> </name></person-group><article-title>What disease does this patient have? A large-scale open domain question answering dataset from medical exams</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 28, 2020</comment><pub-id pub-id-type="doi">10.20944/preprints202105.0498.v1</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="web"><article-title>Step 2 CK content outline &#x0026; specifications</article-title><source>USMLE</source><access-date>2024-10-14</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.usmle.org/prepare-your-exam/step-2-ck-materials/step-2-ck-content-outline-specifications">https://www.usmle.org/prepare-your-exam/step-2-ck-materials/step-2-ck-content-outline-specifications</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="web"><article-title>Step 3 exam content</article-title><source>USMLE</source><access-date>2024-10-14</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.usmle.org/step-exams/step-3/step-3-exam-content">https://www.usmle.org/step-exams/step-3/step-3-exam-content</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="web"><article-title>Aisc-team-a1/augmented-clinical-notes</article-title><source>Hugging Face</source><access-date>2024-07-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/datasets/aisc-team-a1/augmented-clinical-notes">https://huggingface.co/datasets/aisc-team-a1/augmented-clinical-notes</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Touvron</surname><given-names>H</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Stone</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Llama 2: open foundation and fine-tuned chat models</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 19, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2307.09288</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Colgan</surname><given-names>R</given-names> </name><name name-style="western"><surname>Williams</surname><given-names>M</given-names> </name></person-group><article-title>Diagnosis and treatment of acute uncomplicated cystitis</article-title><source>Am Fam Physician</source><year>2011</year><month>10</month><day>1</day><volume>84</volume><issue>7</issue><fpage>771</fpage><lpage>776</lpage><pub-id pub-id-type="medline">22010614</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Mehnert-Kay</surname><given-names>SA</given-names> </name></person-group><article-title>Diagnosis and management of uncomplicated urinary tract infections</article-title><source>Am Fam Physician</source><year>2024</year><month>10</month><day>14</day><access-date>2025-08-26</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.aafp.org/pubs/afp/issues/2005/0801/p451.html">https://www.aafp.org/pubs/afp/issues/2005/0801/p451.html</ext-link></comment></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="web"><article-title>Llama 2: open foundation and fine-tuned chat models</article-title><source>AI Meta</source><access-date>2023-09-06</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/">https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhen</surname><given-names>L</given-names> </name><name name-style="western"><surname>Chiang</surname><given-names>WL</given-names> </name><name name-style="western"><surname>Sheng</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Judging LLM-as-a-judge with MT-bench and chatbot arena</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 23, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2306.05685</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Jones</surname><given-names>CR</given-names> </name><name name-style="western"><surname>Bergen</surname><given-names>BK</given-names> </name></person-group><article-title>People cannot distinguish 
GPT-4 from a human in a Turing test</article-title><source>arXiv</source><comment>Preprint posted online on  May 9, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2405.08007</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Colavito</surname><given-names>G</given-names> </name><name name-style="western"><surname>Lanubile</surname><given-names>F</given-names> </name><name name-style="western"><surname>Novielli</surname><given-names>N</given-names> </name><name name-style="western"><surname>Quaranta</surname><given-names>L</given-names> </name></person-group><article-title>Leveraging GPT-like llms to automate issue labeling</article-title><year>2024</year><month>04</month><day>15</day><conf-name>MSR &#x2019;24</conf-name><conf-date>Apr 15, 2024 to Apr 16, 2024</conf-date><conf-loc>Lisbon Portugal</conf-loc><fpage>469</fpage><lpage>480</lpage><pub-id pub-id-type="doi">10.1145/3643991.3644903</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Haynes</surname><given-names>W</given-names> </name></person-group><article-title>Bonferroni correction</article-title><source>Encyclopedia of Systems Biology</source><year>2013</year><publisher-name>Springer</publisher-name><fpage>154</fpage><lpage>154</lpage><pub-id pub-id-type="doi">10.1007/978-1-4419-9863-7_1213</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="web"><article-title>Amazon bedrock - user guide</article-title><source>Amazon Web Services</source><access-date>2025-09-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://aws.amazon.com/bedrock/">https://aws.amazon.com/bedrock/</ext-link></comment></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation 
citation-type="web"><article-title>OpenAI developer platform</article-title><source>OpenAI Platform</source><access-date>2025-08-26</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://platform.openai.com">https://platform.openai.com</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="web"><article-title>Fine-tuning with the Gemini API</article-title><source>Google AI for Developers</source><access-date>2024-10-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://ai.google.dev/gemini-api/docs/model-tuning">https://ai.google.dev/gemini-api/docs/model-tuning</ext-link></comment></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nashaat</surname><given-names>M</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>J</given-names> </name></person-group><article-title>Towards efficient fine-tuning of language models with organizational data for automated software review</article-title><source>IIEEE Trans Software Eng</source><year>2024</year><volume>50</volume><issue>9</issue><fpage>2240</fpage><lpage>2253</lpage><pub-id pub-id-type="doi">10.1109/TSE.2024.3428324</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guevara</surname><given-names>M</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Thomas</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Large language models to identify social determinants of health in electronic health records</article-title><source>NPJ Digit Med</source><year>2024</year><month>01</month><day>11</day><volume>7</volume><issue>1</issue><fpage>6</fpage><pub-id 
pub-id-type="doi">10.1038/s41746-023-00970-0</pub-id><pub-id pub-id-type="medline">38200151</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Direct preference optimization loss function.</p><media xlink:href="jmir_v27i1e76048_app1.docx" xlink:title="DOCX File, 14 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Glossary of terms.</p><media xlink:href="jmir_v27i1e76048_app2.docx" xlink:title="DOCX File, 14 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Urinary tract infection classification files.</p><media xlink:href="jmir_v27i1e76048_app3.xlsx" xlink:title="XLSX File, 699 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Clinical reasoning files.</p><media xlink:href="jmir_v27i1e76048_app4.xlsx" xlink:title="XLSX File, 3197 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Python code used to generate clinical summarization examples.</p><media xlink:href="jmir_v27i1e76048_app5.docx" xlink:title="DOCX File, 30 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Summarization files.</p><media xlink:href="jmir_v27i1e76048_app6.xlsx" xlink:title="XLSX File, 14624 KB"/></supplementary-material><supplementary-material id="app7"><label>Multimedia Appendix 7</label><p>Triage criteria.</p><media xlink:href="jmir_v27i1e76048_app7.docx" xlink:title="DOCX File, 15 KB"/></supplementary-material><supplementary-material id="app8"><label>Multimedia Appendix 8</label><p>Python code for supervised fine-tuning and direct preference optimization.</p><media xlink:href="jmir_v27i1e76048_app8.docx" xlink:title="DOCX File, 25 KB"/></supplementary-material></app-group></back></article>