<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v27i1e72984</article-id><article-id pub-id-type="doi">10.2196/72984</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Large Language Model Symptom Identification From Clinical Text: Multicenter Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>McMurry</surname><given-names>Andrew J</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Phelan</surname><given-names>Dylan</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Dixon</surname><given-names>Brian E</given-names></name><degrees>MPA, PhD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Geva</surname><given-names>Alon</given-names></name><degrees>MPH, MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Gottlieb</surname><given-names>Daniel</given-names></name><degrees>MPA</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Jones</surname><given-names>James R</given-names></name><degrees>MPhil</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Terry</surname><given-names>Michael</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Taylor</surname><given-names>David E</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Callaway</surname><given-names>Hannah</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Manoharan</surname><given-names>Sneha</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Miller</surname><given-names>Timothy</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Olson</surname><given-names>Karen L</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" corresp="yes"><name 
name-style="western"><surname>Mandl</surname><given-names>Kenneth D</given-names></name><degrees>MPH, MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Computational Health Informatics Program, Boston Children's Hospital</institution><addr-line>401 Park Drive, LM5506, Mail Stop BCH3187</addr-line><addr-line>Boston</addr-line><addr-line>MA</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Pediatrics, Harvard Medical School</institution><addr-line>Boston</addr-line><addr-line>MA</addr-line><country>United States</country></aff><aff id="aff3"><institution>Department of Health Policy and Management, Fairbanks School of Public Health, Indiana University</institution><addr-line>Indianapolis</addr-line><addr-line>IN</addr-line><country>United States</country></aff><aff id="aff4"><institution>Center for Biomedical Informatics, Regenstrief Institute</institution><addr-line>Indianapolis</addr-line><addr-line>IN</addr-line><country>United States</country></aff><aff id="aff5"><institution>Department of Anesthesia, Harvard Medical School</institution><addr-line>Boston</addr-line><addr-line>MA</addr-line><country>United States</country></aff><aff id="aff6"><institution>Department of Biomedical Informatics, Harvard Medical School</institution><addr-line>Boston</addr-line><addr-line>MA</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Sarma</surname><given-names>Karthik</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Dohopolski</surname><given-names>Michael</given-names></name></contrib><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Nomula</surname><given-names>Varun Kumar</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Kenneth D Mandl, MPH, MD, Computational Health Informatics Program, Boston Children's Hospital, 401 Park Drive, LM5506, Mail Stop BCH3187, Boston, MA, 02215, United States, 1 617-355-4145; <email>Kenneth.Mandl@Childrens.Harvard.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>31</day><month>7</month><year>2025</year></pub-date><volume>27</volume><elocation-id>e72984</elocation-id><history><date date-type="received"><day>23</day><month>02</month><year>2025</year></date><date date-type="rev-recd"><day>17</day><month>06</month><year>2025</year></date><date date-type="accepted"><day>18</day><month>06</month><year>2025</year></date></history><copyright-statement>&#x00A9; Andrew J McMurry, Dylan Phelan, Brian E Dixon, Alon Geva, Daniel Gottlieb, James R Jones, Michael Terry, David E Taylor, Hannah Callaway, Sneha Manoharan, Timothy Miller, Karen L Olson, Kenneth D Mandl. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 31.7.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2025/1/e72984"/><abstract><sec><title>Background</title><p>Recognizing patient symptoms is fundamental to medicine, research, and public health. However, symptoms are often underreported in coded formats even though they are routinely documented in physician notes. Large language models (LLMs), noted for their generalizability, could help bridge this gap by mimicking the role of human expert chart reviewers for symptom identification.</p></sec><sec><title>Objective</title><p>The primary objective of this multisite study was to measure the accurate identification of infectious respiratory disease symptoms using LLMs instructed to follow chart review guidelines. The secondary objective was to evaluate LLM generalizability in multisite settings without the need for site-specific training, fine-tuning, or customization.</p></sec><sec sec-type="methods"><title>Methods</title><p>Four LLMs were evaluated: GPT-4, GPT-3.5, Llama2 70B, and Mixtral 8&#x00D7;7B. LLM prompts were instructed to take on the role of chart reviewers and follow symptom annotation guidelines when assessing physician notes. Ground truth labels for each note were annotated by subject matter experts. Optimal LLM prompting strategies were selected using a development corpus of 103 notes from the emergency department at Boston Children&#x2019;s Hospital. The performance of each LLM was measured using a test corpus with 202 notes from Boston Children&#x2019;s Hospital. The performance of an <italic>International Classification of Diseases, Tenth Revision</italic> (<italic>ICD-10</italic>)&#x2013;based method was also measured as a baseline. 
Generalizability of the most performant LLM was then measured in a validation corpus of 308 notes from 21 emergency departments in the Indiana Health Information Exchange.</p></sec><sec sec-type="results"><title>Results</title><p>Symptom identification accuracy was superior for every LLM tested for each infectious respiratory disease symptom compared to an <italic>ICD-10</italic>&#x2013;based method (<italic>F</italic><sub>1</sub>-score=45.1%). GPT-4 was the highest scoring (<italic>F</italic><sub>1</sub>-score=91.4%; <italic>P</italic>&#x003C;.001) and was significantly better than the <italic>ICD-10</italic>&#x2013;based method, followed by GPT-3.5 (<italic>F</italic><sub>1</sub>-score=90.0%; <italic>P</italic>&#x003C;.001), Mixtral (<italic>F</italic><sub>1</sub>-score=83.5%; <italic>P</italic>&#x003C;.001), and Llama2 (<italic>F</italic><sub>1</sub>-score=81.7%; <italic>P</italic>&#x003C;.001). For the validation corpus, performance of the <italic>ICD-10</italic>&#x2013;based method decreased (<italic>F</italic><sub>1</sub>-score=26.9%), while that of GPT-4 increased (<italic>F</italic><sub>1</sub>-score=94.0%), demonstrating better generalizability using GPT-4 (<italic>P</italic>&#x003C;.001).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>LLMs significantly outperformed an <italic>ICD-10</italic>&#x2013;based method for respiratory symptom identification in emergency department electronic health records. GPT-4 demonstrated the highest accuracy and generalizability, suggesting that LLMs may augment or replace traditional approaches. LLMs can be instructed to mimic human chart reviewers with high accuracy. 
Future work should assess broader symptom types and health care settings.</p></sec></abstract><kwd-group><kwd>natural language processing</kwd><kwd>artificial intelligence</kwd><kwd>large language models</kwd><kwd>symptom recognition</kwd><kwd>clinical text mining</kwd><kwd>medical informatics</kwd><kwd>infectious disease surveillance</kwd><kwd>epidemiologic methods</kwd><kwd>emergency medical services</kwd><kwd>electronic health records</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>To practice medicine, accurate identification and interpretation of symptoms are paramount. Symptoms are primary indicators of patient health, underpinning diagnostic processes [<xref ref-type="bibr" rid="ref1">1</xref>] and choice of therapeutic interventions [<xref ref-type="bibr" rid="ref2">2</xref>]. Identifying symptoms is also fundamental to public health [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>], medication safety [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>], clinical research [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>], and clinical trials [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. Though symptoms are routinely documented in physician notes, coded formats such as the <italic>International Classification of Diseases, Tenth Revision</italic> (<italic>ICD-10</italic>) [<xref ref-type="bibr" rid="ref14">14</xref>] often underreport patient symptoms [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref18">18</xref>]. 
The gap between medical coding practices and richer phenotyping has motivated many efforts to develop natural language processing (NLP) of physician notes [<xref ref-type="bibr" rid="ref17">17</xref>].</p><p>Traditional NLP methods for symptom identification [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>] typically target specific note sections [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>] such as the chief complaint [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref24">24</xref>] and often struggle to interpret if or when symptoms are positive [<xref ref-type="bibr" rid="ref25">25</xref>-<xref ref-type="bibr" rid="ref27">27</xref>]. The context [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref28">28</xref>] surrounding infectious respiratory diseases includes symptoms pertaining to acute infections, noninfectious conditions, treatment side effects [<xref ref-type="bibr" rid="ref6">6</xref>], indications for treatment, or patient instructions (eg, &#x201C;Use albuterol inhaler as needed for difficulty breathing&#x201D;).</p><p>Large language models (LLMs) hold potential to overcome such limitations [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. As LLMs are derived from population scale examples, they may better infer symptoms from internet text such as articles about symptom checklists [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref31">31</xref>], disease progression [<xref ref-type="bibr" rid="ref32">32</xref>], and medical decision-making [<xref ref-type="bibr" rid="ref2">2</xref>]. 
Unlike traditional clinical NLP models, LLMs are not trained for any specific domain, which means that LLMs should be more generalizable to documentation variation across health care locations and may not require site-specific training [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref33">33</xref>] to achieve state-of-the-art accuracy.</p><p>We sought to measure the accuracy of LLMs for symptom identification, with a focus on infectious respiratory disease symptoms [<xref ref-type="bibr" rid="ref4">4</xref>]. The code and results are available free of charge under the open-source Apache License 2.0 [<xref ref-type="bibr" rid="ref34">34</xref>].</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>This is a multisite retrospective study of infectious respiratory disease symptoms documented in electronic health records. Ground truth symptom labels were annotated by human expert chart reviewers. Two symptom identification methods were compared to ground truth labels: (1) an <italic>ICD-10</italic>&#x2013;based method using coded data and (2) an LLM-based method using unstructured emergency department (ED) notes. LLM prompting strategies were developed for Llama 2 70B Chat [<xref ref-type="bibr" rid="ref35">35</xref>], Mistral AI Mixtral 8&#x00D7;7B Instruct [<xref ref-type="bibr" rid="ref36">36</xref>], GPT-3.5 turbo (version 0125) [<xref ref-type="bibr" rid="ref37">37</xref>], and GPT-4 turbo (version 0125) [<xref ref-type="bibr" rid="ref37">37</xref>]. 
The selection of LLMs at the time of experimentation represented the state of the art available in our Health Insurance Portability and Accountability Act (HIPAA)&#x2013;authorized environments.</p></sec><sec id="s2-2"><title>Setting</title><p>Boston Children&#x2019;s Hospital (BCH), a large Northeastern urban pediatric academic medical center, and the Indiana Health Information Exchange (IHIE) [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>], a Midwestern statewide health information exchange network, were the study sites. Notes from BCH ED patients (aged 21 years and younger) and from IHIE ED patients (any age) with a COVID-19 diagnosis between March 1, 2020, and May 31, 2022, were eligible for inclusion into the study corpus.</p></sec><sec id="s2-3"><title>Study Corpus</title><p>A study corpus of 613 notes was selected to ensure that it contained examples of rare symptoms. Apache cTAKES [<xref ref-type="bibr" rid="ref40">40</xref>] was used to first identify positive symptoms in each note. At BCH, notes were then selected to include at least 30 positive examples for each of the 11 symptoms, as well as notes with no positive symptoms. These were used for a development corpus (103 notes) to select optimal strategies for each LLM, and a test corpus (202 notes) to measure accuracy. At IHIE, a validation corpus (308 notes) was randomly selected from a larger sample of 300 positive notes for each symptom and used to assess multisite generalizability in a setting comprising many health care locations.</p></sec><sec id="s2-4"><title>Ground Truth</title><p>Three BCH experts collaboratively defined inclusion and exclusion criteria for symptom annotation guidelines [<xref ref-type="bibr" rid="ref4">4</xref>]. They performed iterative cycles of independent chart review, collaborative adjudication of disagreements, and collaborative refinement of symptom annotation guidelines until a consensus was reached. 
Expert pairs reviewed notes from their own site. Interrater reliability was assessed with the kappa statistic [<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref42">42</xref>] (overall mean 0.96, SD 0.07; details in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s2-5"><title>Measures</title><p>Eleven symptoms related to infectious respiratory disease were measured: congestion or runny nose, cough, diarrhea, dyspnea (shortness of breath), fatigue, fever or chills, headache, loss of taste or smell, muscle or body aches, nausea or vomiting, and sore throat.</p><p><italic>F</italic><sub>1</sub>-scores, precision, and recall were calculated for each symptom and for all symptoms combined [<xref ref-type="bibr" rid="ref42">42</xref>]. Micro <italic>F</italic><sub>1</sub>-scores were used, rather than macro <italic>F</italic><sub>1</sub>-scores, to allow for stronger competition from <italic>ICD-10</italic>&#x2013;based metrics, which were quite poor for some symptoms. McNemar tests were used to evaluate LLM versus <italic>ICD-10</italic>&#x2013;based performance. With an overall &#x03B1; of .05, a Bonferroni adjustment for 12 comparisons (11 symptoms plus no symptoms) set the threshold at <italic>P</italic>&#x003C;.0042.</p></sec><sec id="s2-6"><title>Comparator</title><p><italic>ICD-10</italic> codelists (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>) [<xref ref-type="bibr" rid="ref4">4</xref>] for each symptom were compiled by 3 experts at BCH using online resources [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]. The panel collaboratively reviewed whether each candidate code met the inclusion or exclusion criteria defined in the symptom annotation guidelines. 
<italic>ICD-10</italic> codes recorded at the time of ED discharge were matched against the final symptom codelists.</p></sec><sec id="s2-7"><title>Prompt Engineering</title><p>For each LLM, 5 chart review prompts [<xref ref-type="bibr" rid="ref45">45</xref>] were developed to follow symptom annotation guidelines. An overview is shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>. Prompts ranged in complexity from an identity prompt, where LLMs were instructed to assume the identity of a chart reviewer, to a verbose prompt containing symptom-specific synonyms and inclusion and exclusion criteria. The 5 prompts were evaluated across 4 output parsing pipelines, yielding 20 prompting strategies for each LLM (<xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). All pipelines normalized LLM output into a structured CSV format containing symptoms identified in each note. Of the 4 LLM output parsing pipelines, 2 handled text and 2 handled JSON.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Large language model prompts intended to reproduce chart review criteria. The identity prompt contains text present in every type of prompt. The rules prompt extends the identity prompt with basic chart review criteria. Include and exclude prompts extend the rules prompt with symptom-specific criteria. The verbose prompt combines all prompts to approximate the same chart review criteria used by human subject matter experts.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e72984_fig01.png"/></fig></sec><sec id="s2-8"><title>Ethical Considerations</title><p>The BCH Committee on Clinical Investigation (BCH IRB-P00043392) and the Indiana University Institutional Review Board (IU IRB 24673) each determined the study to be exempt from full human participant oversight. 
Waivers of consent were obtained to allow corpus extraction and chart review of ED notes for institutional review board&#x2013;approved study personnel. Notes were not shared between sites and were not anonymized prior to LLM processing. All analyses were conducted in HIPAA-secure environments. Open-source LLMs were hosted on premises. OpenAI models were hosted by Azure under a Business Associate Agreement for HIPAA compliance. Clinical notes and patient data have been omitted from figures, tables, and appendices; only aggregate statistics are reported.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>Demographic characteristics of patients with notes in the study corpus are presented in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>. Frequencies for each symptom are in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>. <xref ref-type="fig" rid="figure2">Figure 2</xref> shows symptom identification <italic>F</italic><sub>1</sub>-scores in the development corpus using the optimal prompting strategy for each LLM. Optimal LLM instructions for chart review varied considerably among LLMs (<xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>). Every LLM was optimized using the JSON output parsing pipeline.</p><p>The performance of each symptom identification method was evaluated with the test corpus using the <italic>F</italic><sub>1</sub>-score statistic. The <italic>ICD-10</italic>&#x2013;based method performed worse (<italic>F</italic><sub>1</sub>-score=45.1%) than every LLM method. 
GPT-4 was the highest-scoring LLM (<italic>F</italic><sub>1</sub>-score=91.4%; <italic>P</italic>&#x003C;.001), followed by GPT-3.5 (<italic>F</italic><sub>1</sub>-score=90.0%; <italic>P</italic>&#x003C;.001), Mixtral (<italic>F</italic><sub>1</sub>-score=83.5%; <italic>P</italic>&#x003C;.001), and Llama2 (<italic>F</italic><sub>1</sub>-score=81.7%; <italic>P</italic>&#x003C;.001). <xref ref-type="fig" rid="figure3">Figure 3</xref> shows symptom accuracy for the optimal prompting strategy of each LLM as well as the <italic>ICD-10</italic>&#x2013;based method. <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref> contains method details and statistical results.</p><p>Using the validation corpus from IHIE, GPT-4 accuracy was measured with no further model training or fine-tuning of the BCH model. Accuracy improved for GPT-4 (<italic>F</italic><sub>1</sub>-score=94.0%; an absolute increase of 2.6%), but accuracy for the <italic>ICD-10</italic>&#x2013;based method was worse (<italic>F</italic><sub>1</sub>-score=26.9%; an absolute decrease of 18.2%). Generalizability from the BCH to IHIE corpus was better for GPT-4 than the <italic>ICD-10</italic> method (<italic>P</italic>&#x003C;.001). <xref ref-type="fig" rid="figure4">Figure 4</xref> shows that GPT-4 accuracy was higher than the <italic>ICD-10</italic>&#x2013;based method for all symptoms at both sites. Details and results are in <xref ref-type="supplementary-material" rid="app8">Multimedia Appendix 8</xref>.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p><italic>F</italic><sub>1</sub>-scores for large language model (LLM) optimal prompt strategies using the development corpus. Each color denotes a symptom identification method using its optimal prompting strategy: Llama2 (identity prompt), Mixtral (exclude prompt), GPT-3.5 (identity prompt), and GPT-4 (include prompt). 
Each of the 11 infectious respiratory disease symptoms is shown as well as a summary score for all symptoms. Overall, GPT-4 performed best, with a micro <italic>F</italic><sub>1</sub>-score of 90.8% for all symptoms combined.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e72984_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p><italic>F</italic><sub>1</sub>-scores for large language model (LLM) optimal prompt strategies using the test corpus. Each color denotes a symptom identification method: Llama2, Mixtral, GPT-3.5, GPT-4, and the <italic>ICD-10</italic>&#x2013;based method. Each of the 11 infectious respiratory disease symptoms is shown as well as a summary score for all symptoms. GPT-4 performed best, with an overall micro <italic>F</italic><sub>1</sub>-score of 91.4% for all symptoms combined. <italic>ICD-10</italic>: <italic>International Classification of Diseases, Tenth Revision.</italic></p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e72984_fig03.png"/></fig><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Generalizability of symptom identification accuracy across sites. One site is a large Northeastern urban pediatric academic medical center (BCH). The other is a Midwestern statewide health information exchange (IHIE) that provided data from 21 emergency departments. GPT-4 and the <italic>ICD-10</italic>&#x2013;based method were compared. <italic>F</italic><sub>1</sub>-scores are shown for each method and symptom benchmarked against ground truth labels from chart reviews in test and validation corpora. 
BCH: Boston Children&#x2019;s Hospital; <italic>ICD-10</italic>: <italic>International Classification of Diseases, Tenth Revision</italic>; IHIE: Indiana Health Information Exchange.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e72984_fig04.png"/></fig></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>In this multisite study, LLM-based symptom identification consistently outperformed <italic>ICD-10</italic>&#x2013;based methods for each infectious respiratory disease symptom evaluated. GPT-4 achieved the highest <italic>F</italic><sub>1</sub>-score, and results generalized well to an external validation corpus without customization. Low accuracy for <italic>ICD-10</italic>&#x2013;based symptom identification and variability in multisite studies are consistent with prior literature [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref46">46</xref>].</p><p>Importantly, LLM strategies all used &#x201C;zero-shot&#x201D; prompts and required no site-specific artificial intelligence training, fine-tuning, or ground truth examples. The potential to reduce human labor represents a major advantage of LLM methods over traditional NLP methods that require human labor to curate symptom concept dictionaries, annotate ground truth examples, and calibrate at each health care site.</p></sec><sec id="s4-2"><title>Limitations and Future Work</title><p>This study focused specifically on identifying symptoms of infectious respiratory diseases. However, generalizability of LLMs to other clinical domains and broader symptom categories remains to be validated. Furthermore, while GPT-4 performance was excellent in a validation corpus from 21 EDs, other settings, including primary care, should be studied. 
Other LLMs, such as Google Gemini, Anthropic Claude, and DeepSeek R1, were not available for use in our HIPAA-secure settings. Future work should explore recent LLM developments. For example, the latest agentic methods could generalize to new symptom sets dynamically through multistage interactions with users.</p><p>It was beyond the scope of this study to estimate symptom prevalence in the study population. However, given outstanding LLM performance, one could approximate true prevalence from apparent prevalence in electronic health records [<xref ref-type="bibr" rid="ref47">47</xref>]. Future work is needed to incorporate LLM-assisted chart review and pattern recognition. Doing this in real time, at a national scale, would truly improve public health efforts [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref48">48</xref>].</p></sec><sec id="s4-3"><title>Conclusions</title><p>Our findings underscore the potential of LLMs to address gaps in traditional methods to identify symptoms in health records, paving the way for advancements in syndromic biosurveillance and other use cases. LLMs can be instructed to mimic human chart reviewers with high accuracy. Future work should assess broader symptom types and health care settings.</p></sec></sec></body><back><ack><p>Support for this study was provided by the Advanced Research Projects Agency for Health (ARPA-H) and the National Center for Advancing Translational Sciences (NCATS; 75N95023D00001, 75N95023F00019, and 75N95024F00013), National Institutes of Health (U01TR002623), the Office of the National Coordinator for Health Information Technology (ONC; 90AX0031 and 90C30007), and the Centers for Disease Control and Prevention (CDC) of the US Department of Health and Human Services as part of a financial assistance award. 
Generative artificial intelligence was not used to design or conduct this study or prepare the manuscript.</p></ack><notes><sec><title>Data Availability</title><p>The emergency department (ED) notes analyzed during this study are protected under privacy and confidentiality regulations and cannot be shared openly. However, to facilitate reproducibility, the prompts, supporting datasets (excluding ED notes), and detailed methodological descriptions are available from the corresponding author or from the repository on GitHub [<xref ref-type="bibr" rid="ref34">34</xref>]. Access will be granted in accordance with ethical and institutional guidelines. All code, including large language model prompts, and results are freely available on GitHub [<xref ref-type="bibr" rid="ref34">34</xref>].</p></sec></notes><fn-group><fn fn-type="con"><p>As per guidelines of the International Committee of Medical Journal Editors, all authors contributed to the conceptualization or design of the study and the acquisition, analysis, or interpretation of the data as follows: conceptualization (KDM, AJM, DP, AG, TM, JRJ, and DG), data curation (AJM, AG, HC, and SM), formal analysis (AJM and DP), funding acquisition (KDM), investigation (AJM, DP, AG, DET, HC, and SM), methodology (KDM, AJM, DP, DG, TM, JRJ, and KLO), project administration (JRJ, BED, and DET), software (DP, AJM, MT, and DG), supervision (KDM and TM), validation (BED, DET, HC, and SM), and visualization (DP and JRJ). In terms of manuscript preparation, drafts of the manuscript were written by AJM, DP, KDM, and KLO; critical input was solicited from all authors and incorporated. 
All authors reviewed, edited, and approved the final version.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">BCH</term><def><p>Boston Children&#x2019;s Hospital</p></def></def-item><def-item><term id="abb2">ED</term><def><p>emergency department</p></def></def-item><def-item><term id="abb3">HIPAA</term><def><p>Health Insurance Portability and Accountability Act</p></def></def-item><def-item><term id="abb4"><italic>ICD-10</italic></term><def><p><italic>International Classification of Diseases, Tenth Revision</italic></p></def></def-item><def-item><term id="abb5">IHIE</term><def><p>Indiana Health Information Exchange</p></def></def-item><def-item><term id="abb6">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb7">NLP</term><def><p>natural language processing</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>A</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>DO</given-names> </name><name name-style="western"><surname>Tian</surname><given-names>L</given-names> </name></person-group><article-title>Benchmarking the symptom-checking capabilities of ChatGPT for a broad range of diseases</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>2084</fpage><lpage>2088</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocad245</pub-id><pub-id pub-id-type="medline">38109889</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Harskamp</surname><given-names>RE</given-names> </name><name name-style="western"><surname>De 
Clercq</surname><given-names>L</given-names> </name></person-group><article-title>Performance of ChatGPT as an AI-assisted decision support tool in medicine: a proof-of-concept study for interpreting symptoms and management of common cardiac conditions (AMSTELHEART-2)</article-title><source>Acta Cardiol</source><year>2024</year><month>03</month><day>15</day><volume>79</volume><issue>3</issue><fpage>358</fpage><lpage>366</lpage><pub-id pub-id-type="doi">10.1080/00015385.2024.2303528</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mandl</surname><given-names>KD</given-names> </name><name name-style="western"><surname>Gottlieb</surname><given-names>D</given-names> </name><name name-style="western"><surname>Mandel</surname><given-names>JC</given-names> </name><etal/></person-group><article-title>Push button population health: the SMART/HL7 FHIR Bulk Data Access application programming interface</article-title><source>NPJ Digit Med</source><year>2020</year><month>11</month><day>19</day><volume>3</volume><issue>1</issue><fpage>151</fpage><pub-id pub-id-type="doi">10.1038/s41746-020-00358-4</pub-id><pub-id pub-id-type="medline">33299056</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McMurry</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Zipursky</surname><given-names>AR</given-names> </name><name name-style="western"><surname>Geva</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Moving biosurveillance beyond coded data using AI for symptom detection from physician notes: retrospective cohort study</article-title><source>J Med Internet Res</source><year>2024</year><month>04</month><day>4</day><volume>26</volume><fpage>e53367</fpage><pub-id 
pub-id-type="doi">10.2196/53367</pub-id><pub-id pub-id-type="medline">38573752</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Matheny</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>JC</given-names> </name><etal/></person-group><article-title>Enhancing postmarketing surveillance of medical products with large language models</article-title><source>JAMA Netw Open</source><year>2024</year><month>08</month><day>1</day><volume>7</volume><issue>8</issue><fpage>e2428276</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.28276</pub-id><pub-id pub-id-type="medline">39150707</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Henry</surname><given-names>S</given-names> </name><name name-style="western"><surname>Buchan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Filannino</surname><given-names>M</given-names> </name><name name-style="western"><surname>Stubbs</surname><given-names>A</given-names> </name><name name-style="western"><surname>Uzuner</surname><given-names>O</given-names> </name></person-group><article-title>2018 n2c2 shared task on adverse drug events and medication extraction in electronic health records</article-title><source>J Am Med Inform Assoc</source><year>2020</year><month>01</month><day>1</day><volume>27</volume><issue>1</issue><fpage>3</fpage><lpage>12</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocz166</pub-id><pub-id pub-id-type="medline">31584655</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Clark-Cutaia</surname><given-names>MN</given-names> </name><name name-style="western"><surname>Rivera</surname><given-names>E</given-names> </name><name name-style="western"><surname>Iroegbu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Arneson</surname><given-names>G</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>R</given-names> </name><name name-style="western"><surname>Anastasi</surname><given-names>JK</given-names> </name></person-group><article-title>Exploring the evidence: symptom burden in chronic kidney disease</article-title><source>Nephrol Nurs J</source><year>2022</year><volume>49</volume><issue>3</issue><fpage>227</fpage><lpage>255</lpage><pub-id pub-id-type="doi">10.37526/1526-744X.2022.49.3.227</pub-id><pub-id pub-id-type="medline">35802361</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stubbs</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kotfila</surname><given-names>C</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Uzuner</surname><given-names>&#x00D6;</given-names> </name></person-group><article-title>Identifying risk factors for heart disease over time: overview of 2014 i2b2/UTHealth shared task Track 2</article-title><source>J Biomed Inform</source><year>2015</year><month>12</month><volume>58 Suppl</volume><issue>Suppl</issue><fpage>S67</fpage><lpage>S77</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2015.07.001</pub-id><pub-id pub-id-type="medline">26210362</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ni</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Kennebeck</surname><given-names>S</given-names> </name><name name-style="western"><surname>Dexheimer</surname><given-names>JW</given-names> </name><etal/></person-group><article-title>Automated clinical trial eligibility prescreening: increasing the efficiency of patient identification for clinical trials in the emergency department</article-title><source>J Am Med Inform Assoc</source><year>2015</year><month>01</month><volume>22</volume><issue>1</issue><fpage>166</fpage><lpage>178</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2014-002887</pub-id><pub-id pub-id-type="medline">25030032</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="web"><article-title>A study to compare two formulations of xylometazoline/dexpanthenol nasal spray for the treatment of nasal congestion</article-title><source>ClinicalTrials.gov</source><access-date>2025-05-19</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://clinicaltrials.gov/study/NCT03439436">https://clinicaltrials.gov/study/NCT03439436</ext-link></comment></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="web"><article-title>Open trial of biofeedback for respiratory symptoms</article-title><source>ClinicalTrials.gov</source><access-date>2025-05-19</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://clinicaltrials.gov/study/NCT05973513">https://clinicaltrials.gov/study/NCT05973513</ext-link></comment></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gulden</surname><given-names>C</given-names> </name><name name-style="western"><surname>Mate</surname><given-names>S</given-names> </name><name name-style="western"><surname>Prokosch</surname><given-names>HU</given-names> </name><name name-style="western"><surname>Kraus</surname><given-names>S</given-names> 
</name></person-group><article-title>Investigating the capabilities of FHIR search for clinical trial phenotyping</article-title><source>Stud Health Technol Inform</source><year>2018</year><volume>253</volume><fpage>3</fpage><lpage>7</lpage><pub-id pub-id-type="medline">30147028</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yarlas</surname><given-names>A</given-names> </name><name name-style="western"><surname>Maher</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bayliss</surname><given-names>M</given-names> </name><etal/></person-group><article-title>The Inflammatory Bowel Disease Questionnaire in randomized controlled trials of treatment for ulcerative colitis: systematic review and meta-analysis</article-title><source>J Patient Cent Res Rev</source><year>2020</year><volume>7</volume><issue>2</issue><fpage>189</fpage><lpage>205</lpage><pub-id pub-id-type="doi">10.17294/2330-0698.1722</pub-id><pub-id pub-id-type="medline">32377552</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="web"><article-title>ICD-10-CM</article-title><source>Classification of Diseases, Functioning, and Disability</source><year>2024</year><access-date>2025-05-19</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cdc.gov/nchs/icd/icd-10-cm/index.html">https://www.cdc.gov/nchs/icd/icd-10-cm/index.html</ext-link></comment></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Malden</surname><given-names>DE</given-names> </name><name name-style="western"><surname>Tartof</surname><given-names>SY</given-names> </name><name name-style="western"><surname>Ackerson</surname><given-names>BK</given-names> </name><etal/></person-group><article-title>Natural language 
processing for improved characterization of COVID-19 symptoms: observational study of 350,000 patients in a large integrated health care system</article-title><source>JMIR Public Health Surveill</source><year>2022</year><month>12</month><day>30</day><volume>8</volume><issue>12</issue><fpage>e41529</fpage><pub-id pub-id-type="doi">10.2196/41529</pub-id><pub-id pub-id-type="medline">36446133</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Crabb</surname><given-names>BT</given-names> </name><name name-style="western"><surname>Lyons</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bale</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Comparison of International Classification of Diseases and Related Health Problems, Tenth Revision codes with electronic medical records among patients with symptoms of coronavirus disease 2019</article-title><source>JAMA Netw Open</source><year>2020</year><month>08</month><day>3</day><volume>3</volume><issue>8</issue><fpage>e2017703</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2020.17703</pub-id><pub-id pub-id-type="medline">32797176</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Koleck</surname><given-names>TA</given-names> </name><name name-style="western"><surname>Dreisbach</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bourne</surname><given-names>PE</given-names> </name><name name-style="western"><surname>Bakken</surname><given-names>S</given-names> </name></person-group><article-title>Natural language processing of symptoms documented in free-text narratives of electronic health records: a systematic review</article-title><source>J Am Med Inform 
Assoc</source><year>2019</year><month>04</month><day>1</day><volume>26</volume><issue>4</issue><fpage>364</fpage><lpage>379</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocy173</pub-id><pub-id pub-id-type="medline">30726935</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hardjojo</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gunachandran</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pang</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Validation of a natural language processing algorithm for detecting infectious disease symptoms in primary care electronic medical records in Singapore</article-title><source>JMIR Med Inform</source><year>2018</year><month>06</month><day>11</day><volume>6</volume><issue>2</issue><fpage>e36</fpage><pub-id pub-id-type="doi">10.2196/medinform.8204</pub-id><pub-id pub-id-type="medline">29907560</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Karagounis</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sarkar</surname><given-names>IN</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>ES</given-names> </name></person-group><article-title>Coding free-text chief complaints from a Health Information Exchange: a preliminary study</article-title><source>AMIA Annu Symp Proc</source><year>2020</year><volume>2020</volume><fpage>638</fpage><lpage>647</lpage><pub-id pub-id-type="medline">33936438</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>W</given-names> </name><name 
name-style="western"><surname>Dligach</surname><given-names>D</given-names> </name><name name-style="western"><surname>Afshar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>TA</given-names> </name></person-group><article-title>Improving the transferability of clinical note section classification models with BERT and large language model ensembles</article-title><source>Proc Conf Assoc Comput Linguist Meet</source><year>2023</year><month>07</month><volume>2023</volume><fpage>125</fpage><lpage>130</lpage><pub-id pub-id-type="medline">37786810</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>F</given-names> </name><name name-style="western"><surname>Laish</surname><given-names>I</given-names> </name><name name-style="western"><surname>Benjamini</surname><given-names>A</given-names> </name><name name-style="western"><surname>Feder</surname><given-names>A</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Lavelli</surname><given-names>A</given-names> </name><name name-style="western"><surname>Holderness</surname><given-names>E</given-names> </name><name name-style="western"><surname>Jimeno Yepes</surname><given-names>A</given-names> </name><name name-style="western"><surname>Minard</surname><given-names>AL</given-names> </name><name name-style="western"><surname>Pustejovsky</surname><given-names>J</given-names> </name><name name-style="western"><surname>Rinaldi</surname><given-names>F</given-names> </name></person-group><article-title>Section classification in clinical notes with multi-task transformers</article-title><source>Proceedings of the 13th International Workshop on Health Text Mining and Information 
Analysis (LOUHI)</source><year>2022</year><fpage>54</fpage><lpage>59</lpage><pub-id pub-id-type="doi">10.18653/v1/2022.louhi-1.7</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gould</surname><given-names>DW</given-names> </name><name name-style="western"><surname>Walker</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yoon</surname><given-names>PW</given-names> </name></person-group><article-title>The evolution of BioSense: lessons learned and future directions</article-title><source>Public Health Rep</source><year>2017</year><volume>132</volume><issue>1_suppl</issue><fpage>7S</fpage><lpage>11S</lpage><pub-id pub-id-type="doi">10.1177/0033354917706954</pub-id><pub-id pub-id-type="medline">28692386</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Reis</surname><given-names>BY</given-names> </name><name name-style="western"><surname>Kirby</surname><given-names>C</given-names> </name><name name-style="western"><surname>Hadden</surname><given-names>LE</given-names> </name><etal/></person-group><article-title>AEGIS: a robust and scalable real-time public health surveillance system</article-title><source>J Am Med Inform Assoc</source><year>2007</year><volume>14</volume><issue>5</issue><fpage>581</fpage><lpage>588</lpage><pub-id pub-id-type="doi">10.1197/jamia.M2342</pub-id><pub-id pub-id-type="medline">17600100</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McMurry</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Gilbert</surname><given-names>CA</given-names> </name><name 
name-style="western"><surname>Reis</surname><given-names>BY</given-names> </name><name name-style="western"><surname>Chueh</surname><given-names>HC</given-names> </name><name name-style="western"><surname>Kohane</surname><given-names>IS</given-names> </name><name name-style="western"><surname>Mandl</surname><given-names>KD</given-names> </name></person-group><article-title>A self-scaling, distributed information architecture for public health, research, and clinical care</article-title><source>J Am Med Inform Assoc</source><year>2007</year><volume>14</volume><issue>4</issue><fpage>527</fpage><lpage>533</lpage><pub-id pub-id-type="doi">10.1197/jamia.M2371</pub-id><pub-id pub-id-type="medline">17460129</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Harkema</surname><given-names>H</given-names> </name><name name-style="western"><surname>Dowling</surname><given-names>JN</given-names> </name><name name-style="western"><surname>Thornblade</surname><given-names>T</given-names> </name><name name-style="western"><surname>Chapman</surname><given-names>WW</given-names> </name></person-group><article-title>ConText: an algorithm for determining negation, experiencer, and temporal status from clinical reports</article-title><source>J Biomed Inform</source><year>2009</year><month>10</month><volume>42</volume><issue>5</issue><fpage>839</fpage><lpage>851</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2009.05.002</pub-id><pub-id pub-id-type="medline">19435614</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bethard</surname><given-names>S</given-names> </name><name name-style="western"><surname>Dligach</surname><given-names>D</given-names> 
</name><name name-style="western"><surname>Sadeque</surname><given-names>F</given-names> </name><name name-style="western"><surname>Savova</surname><given-names>G</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>TA</given-names> </name></person-group><article-title>Does BERT need domain adaptation for clinical negation detection?</article-title><source>J Am Med Inform Assoc</source><year>2020</year><month>04</month><day>1</day><volume>27</volume><issue>4</issue><fpage>584</fpage><lpage>591</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaa001</pub-id><pub-id pub-id-type="medline">32044989</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miller</surname><given-names>T</given-names> </name><name name-style="western"><surname>Bethard</surname><given-names>S</given-names> </name><name name-style="western"><surname>Dligach</surname><given-names>D</given-names> </name><name name-style="western"><surname>Savova</surname><given-names>G</given-names> </name></person-group><article-title>End-to-end clinical temporal information extraction with multi-head attention</article-title><source>Proc Conf Assoc Comput Linguist Meet</source><year>2023</year><month>07</month><volume>2023</volume><fpage>313</fpage><lpage>319</lpage><pub-id pub-id-type="medline">37780680</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>C</given-names> </name><name name-style="western"><surname>Dantona</surname><given-names>C</given-names> </name><name name-style="western"><surname>Hull</surname><given-names>B</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>J</given-names> 
</name></person-group><article-title>DRG-LLaMA: tuning LLaMA model to predict diagnosis-related group for hospitalized patients</article-title><source>NPJ Digit Med</source><year>2024</year><month>01</month><day>22</day><volume>7</volume><issue>1</issue><fpage>16</fpage><pub-id pub-id-type="doi">10.1038/s41746-023-00989-3</pub-id><pub-id pub-id-type="medline">38253711</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>He</surname><given-names>K</given-names> </name><name name-style="western"><surname>Mao</surname><given-names>R</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>A survey of large language models for healthcare: from data, technology, and applications to accountability and ethics</article-title><source>Inform Fusion</source><year>2025</year><month>06</month><volume>118</volume><fpage>102963</fpage><pub-id pub-id-type="doi">10.1016/j.inffus.2025.102963</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Workman</surname><given-names>TE</given-names> </name><name name-style="western"><surname>Ahmed</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sheriff</surname><given-names>HM</given-names> </name><etal/></person-group><article-title>ChatGPT-4 extraction of heart failure symptoms and signs from electronic health records</article-title><source>Prog Cardiovasc Dis</source><year>2024</year><volume>87</volume><fpage>44</fpage><lpage>49</lpage><pub-id pub-id-type="doi">10.1016/j.pcad.2024.10.010</pub-id><pub-id pub-id-type="medline">39442600</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Pugliese</surname><given-names>G</given-names> </name><name name-style="western"><surname>Maccari</surname><given-names>A</given-names> </name><name name-style="western"><surname>Felisati</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Are artificial intelligence large language models a reliable tool for difficult differential diagnosis? An a posteriori analysis of a peculiar case of necrotizing otitis externa</article-title><source>Clin Case Rep</source><year>2023</year><month>09</month><volume>11</volume><issue>9</issue><fpage>e7933</fpage><pub-id pub-id-type="doi">10.1002/ccr3.7933</pub-id><pub-id pub-id-type="medline">37736475</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Maillard</surname><given-names>A</given-names> </name><name name-style="western"><surname>Micheli</surname><given-names>G</given-names> </name><name name-style="western"><surname>Lefevre</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Can chatbot artificial intelligence replace infectious diseases physicians in the management of bloodstream infections? 
A prospective cohort study</article-title><source>Clin Infect Dis</source><year>2024</year><month>04</month><day>10</day><volume>78</volume><issue>4</issue><fpage>825</fpage><lpage>832</lpage><pub-id pub-id-type="doi">10.1093/cid/ciad632</pub-id><pub-id pub-id-type="medline">37823416</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Nori</surname><given-names>H</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>YT</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Can generalist foundation models outcompete special-purpose tuning? Case study in medicine</article-title><source>arXiv</source><comment>Preprint posted online on Nov 28, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2311.16452</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="web"><article-title>Smart-on-fhir/infectious-symptoms-llm-study</article-title><source>GitHub</source><year>2025</year><access-date>2025-05-19</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/smart-on-fhir/infectious-symptoms-llm-study">https://github.com/smart-on-fhir/infectious-symptoms-llm-study</ext-link></comment></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="web"><article-title>Meta Llama 2</article-title><source>Meta Llama</source><access-date>2025-05-19</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://llama.meta.com/llama2/">https://llama.meta.com/llama2/</ext-link></comment></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="web"><article-title>Mixtral of experts</article-title><source>Mistral AI</source><year>2023</year><access-date>2025-05-19</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://mistral.ai/news/mixtral-of-experts/">https://mistral.ai/news/mixtral-of-experts/</ext-link></comment></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="web"><article-title>GPT-4</article-title><source>OpenAI</source><access-date>2025-05-19</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/index/gpt-4-research/">https://openai.com/index/gpt-4-research/</ext-link></comment></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Overhage</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Kansky</surname><given-names>JP</given-names> </name></person-group><article-title>The Indiana Health Information Exchange</article-title><source>Health Information Exchange</source><year>2023</year><publisher-name>Elsevier</publisher-name><fpage>471</fpage><lpage>487</lpage><pub-id pub-id-type="doi">10.1016/B978-0-323-90802-3.00022-8</pub-id><pub-id pub-id-type="other">9780323908023</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Williams</surname><given-names>KS</given-names> </name><name name-style="western"><surname>Rahurkar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Grannis</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Schleyer</surname><given-names>TK</given-names> </name><name name-style="western"><surname>Dixon</surname><given-names>BE</given-names> </name></person-group><article-title>Evolution of clinical Health Information Exchanges to population health resources: a case study of the Indiana network for patient care</article-title><source>BMC Med Inform Decis 
Mak</source><year>2025</year><month>02</month><day>24</day><volume>25</volume><issue>1</issue><fpage>97</fpage><pub-id pub-id-type="doi">10.1186/s12911-025-02933-9</pub-id><pub-id pub-id-type="medline">39994604</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Savova</surname><given-names>GK</given-names> </name><name name-style="western"><surname>Masanz</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Ogren</surname><given-names>PV</given-names> </name><etal/></person-group><article-title>Mayo clinical Text Analysis and Knowledge Extraction System (cTAKES): architecture, component evaluation and applications</article-title><source>J Am Med Inform Assoc</source><year>2010</year><volume>17</volume><issue>5</issue><fpage>507</fpage><lpage>513</lpage><pub-id pub-id-type="doi">10.1136/jamia.2009.001560</pub-id><pub-id pub-id-type="medline">20819853</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McHugh</surname><given-names>ML</given-names> </name></person-group><article-title>Interrater reliability: the kappa statistic</article-title><source>Biochem Med (Zagreb)</source><year>2012</year><volume>22</volume><issue>3</issue><fpage>276</fpage><lpage>282</lpage><pub-id pub-id-type="medline">23092060</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hripcsak</surname><given-names>G</given-names> </name><name name-style="western"><surname>Rothschild</surname><given-names>AS</given-names> </name></person-group><article-title>Agreement, the f-measure, and reliability in information retrieval</article-title><source>J Am Med Inform 
Assoc</source><year>2005</year><volume>12</volume><issue>3</issue><fpage>296</fpage><lpage>298</lpage><pub-id pub-id-type="doi">10.1197/jamia.M1733</pub-id><pub-id pub-id-type="medline">15684123</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bodenreider</surname><given-names>O</given-names> </name></person-group><article-title>The Unified Medical Language System (UMLS): integrating biomedical terminology</article-title><source>Nucleic Acids Res</source><year>2004</year><month>01</month><day>1</day><volume>32</volume><issue>Database issue</issue><fpage>D267</fpage><lpage>70</lpage><pub-id pub-id-type="doi">10.1093/nar/gkh061</pub-id><pub-id pub-id-type="medline">14681409</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="web"><source>ICD-10-CM</source><access-date>2025-05-19</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://icd10cmtool.cdc.gov/">https://icd10cmtool.cdc.gov/</ext-link></comment></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Prompt engineering in consistency and reliability with the evidence-based guideline for LLMs</article-title><source>NPJ Digit Med</source><year>2024</year><month>02</month><day>20</day><volume>7</volume><issue>1</issue><fpage>41</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01029-4</pub-id><pub-id pub-id-type="medline">38378899</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Nelson</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Yin</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Trujillo Rivera</surname><given-names>EA</given-names> </name><etal/></person-group><article-title>Are ICD codes reliable for observational studies? Assessing coding consistency for data quality</article-title><source>Digit Health</source><year>2024</year><volume>10</volume><fpage>20552076241297056</fpage><pub-id pub-id-type="doi">10.1177/20552076241297056</pub-id><pub-id pub-id-type="medline">39493629</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mandl</surname><given-names>KD</given-names> </name><name name-style="western"><surname>Kohane</surname><given-names>IS</given-names> </name></person-group><article-title>Federalist principles for healthcare data networks</article-title><source>Nat Biotechnol</source><year>2015</year><month>04</month><volume>33</volume><issue>4</issue><fpage>360</fpage><lpage>363</lpage><pub-id pub-id-type="doi">10.1038/nbt.3180</pub-id><pub-id pub-id-type="medline">25850061</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McMurry</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Gottlieb</surname><given-names>DI</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>TA</given-names> </name><etal/></person-group><article-title>Cumulus: a federated electronic health record-based learning system powered by Fast Healthcare Interoperability Resources and artificial intelligence</article-title><source>J Am Med Inform 
Assoc</source><year>2024</year><month>08</month><day>1</day><volume>31</volume><issue>8</issue><fpage>1638</fpage><lpage>1647</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae130</pub-id><pub-id pub-id-type="medline">38860521</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Kappa agreement scores are shown for human expert chart reviewers at 2 sites (BCH and IHIE). At BCH, a third reviewer (AG) was available for measurement. IHIE had 2 reviewers. BCH: Boston Children&#x2019;s Hospital; IHIE: Indiana Health Information Exchange.</p><media xlink:href="jmir_v27i1e72984_app1.xlsx" xlink:title="XLSX File, 145 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p><italic>ICD-10</italic> codes for symptoms of infectious respiratory disease. <italic>ICD-10</italic>: <italic>International Classification of Diseases, Tenth Revision.</italic></p><media xlink:href="jmir_v27i1e72984_app2.xlsx" xlink:title="XLSX File, 11 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Prompting templates and strategies, including verbatim prompting templates used across different large language models to conform to their instruction tuning specifications, as well as all 20 prompting strategies examined using our development corpus.</p><media xlink:href="jmir_v27i1e72984_app3.pdf" xlink:title="PDF File, 569 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Patient demographics across sites and corpora. Demographics reported include binned age groups, administrative sex, and patient-reported race.</p><media xlink:href="jmir_v27i1e72984_app4.xlsx" xlink:title="XLSX File, 209 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Frequency of suspected symptoms at the time of corpus construction. 
To ensure a decent distribution of symptoms across each corpus, samples were based on cTAKES-annotated symptom mentions. This aimed to guarantee that, even for rare symptoms, a bare minimum of symptoms likely to be positive was included in all corpora.</p><media xlink:href="jmir_v27i1e72984_app5.xlsx" xlink:title="XLSX File, 139 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Large language model (LLM) symptom identification performance using the development corpus. <italic>F</italic><sub>1</sub>-scores are provided for all 80 combinations of models and strategies. Detailed performance results are provided for each LLM using its best-performing strategy.</p><media xlink:href="jmir_v27i1e72984_app6.xlsx" xlink:title="XLSX File, 106 KB"/></supplementary-material><supplementary-material id="app7"><label>Multimedia Appendix 7</label><p>Symptom identification performance using the test corpus from BCH and the best strategy identified for each LLM. Metrics include <italic>F</italic><sub>1</sub>-score, sensitivity, specificity, positive predictive value, negative predictive value, as well as raw counts of true positives, false negatives, true negatives, and false positives across all symptoms individually and aggregated. McNemar significance tests compare <italic>ICD-10</italic>&#x2013;based symptom identification to LLM-based symptom identification. BCH: Boston Children&#x2019;s Hospital; <italic>ICD-10</italic>: <italic>International Classification of Diseases, Tenth Revision</italic>; LLM: large language model.</p><media xlink:href="jmir_v27i1e72984_app7.xlsx" xlink:title="XLSX File, 227 KB"/></supplementary-material><supplementary-material id="app8"><label>Multimedia Appendix 8</label><p>LLM symptom identification performance using the validation corpus. 
Sheets provided show detailed results for GPT-4 and <italic>ICD-10</italic> (including performance metrics and raw counts) as well as tables comparing the performance of both the validation and test corpora. McNemar significance tests compare <italic>ICD-10</italic>&#x2013;based symptom identification to LLM-based symptom identification. <italic>ICD-10</italic>: <italic>International Classification of Diseases, Tenth Revision</italic>; LLM: large language model.</p><media xlink:href="jmir_v27i1e72984_app8.xlsx" xlink:title="XLSX File, 149 KB"/></supplementary-material></app-group></back></article>