<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v27i1e74094</article-id><article-id pub-id-type="doi">10.2196/74094</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Assessing Large Language Models in Building a Structured Dataset From AskDocs Subreddit Data: Methodological Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Snell</surname><given-names>Quinn</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Westhoff</surname><given-names>Chase</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Westhoff</surname><given-names>John</given-names></name><degrees>MPH, MD</degrees><xref ref-type="aff" 
rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Low</surname><given-names>Ethan</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Hanson</surname><given-names>Carl L</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Tass</surname><given-names>E Shannon Neeley</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Brigham Young University</institution><addr-line>3361 TMCB</addr-line><addr-line>Provo</addr-line><addr-line>UT</addr-line><country>United States</country></aff><aff id="aff2"><institution>University of Nevada, Reno</institution><addr-line>Reno</addr-line><addr-line>NV</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Onah</surname><given-names>Chibuzo</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Ewelu</surname><given-names>Stephanie</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Kath</surname><given-names>Suraj</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Quinn Snell, PhD, Brigham Young University, 3361 TMCB, Provo, UT, 84602, United States, 1 8014225098; 
<email>snell@cs.byu.edu</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>all authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>22</day><month>10</month><year>2025</year></pub-date><volume>27</volume><elocation-id>e74094</elocation-id><history><date date-type="received"><day>17</day><month>03</month><year>2025</year></date><date date-type="rev-recd"><day>03</day><month>09</month><year>2025</year></date><date date-type="accepted"><day>05</day><month>09</month><year>2025</year></date></history><copyright-statement>&#x00A9; Quinn Snell, Chase Westhoff, John Westhoff, Ethan Low, Carl L Hanson, E Shannon Neeley Tass. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 22.10.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2025/1/e74094"/><abstract><sec><title>Background</title><p>In an era marked by a growing reliance on digital platforms for health care consultation, the subreddit r/AskDocs has emerged as a pivotal forum. However, the vast, unstructured nature of forum data presents a formidable challenge; the extraction and meaningful analysis of such data require advanced tools that can navigate the complexities of language and context inherent in user-generated content. The emergence of large language models (LLMs) offers new tools for the extraction of health-related content from unstructured text found in social media platforms such as Reddit.</p></sec><sec><title>Objective</title><p>This methodological study aimed to evaluate the use of LLMs to systematically transform the rich, unstructured textual data from the AskDocs subreddit into a structured dataset, an approach that aligns more closely with human cognitive processes than traditional data extraction methods.</p></sec><sec sec-type="methods"><title>Methods</title><p>Human annotators and LLMs were used to extract data from 2800 randomly sampled r/AskDocs subreddit posts. For human annotation, at least 2 medical students extracted demographic information, type of inquiry (diagnosis, symptom, or treatment), proxy relationship, chronic condition, health care consultation status, and primary focus topic. For LLM data extraction, specially engineered prompts were created using JavaScript Object Notation and few-shot prompting. Prompts were used to query several state-of-the-art LLMs (eg, Llama 3, Gemma, and GPT). 
Cohen &#x03BA; was calculated across all human annotators, with this dataset serving as the gold standard for comparison with LLM data extraction. A high degree of human annotator reliability was observed for the coding of demographic information. Lower reliability was seen in coding the health-related content of the posts.</p></sec><sec sec-type="results"><title>Results</title><p>The highest performance scores compared with the gold standard were achieved by Llama 3 70B with 7 few-shot prompt examples (average accuracy=87.4) and GPT-4 with 2 few-shot prompt examples (average accuracy=87.4). Llama 3 70B excelled in coding health-related content while GPT-4 performed better coding demographic content from unstructured posts.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>LLMs performed comparably with human annotators in extracting demographic and health-related information from the AskDocs subreddit unstructured posts. This study validates the use of LLMs for analyzing digital health care communications and highlights their potential as reliable tools for understanding online behaviors and interactions, shifting toward more sophisticated methodologies in digital research and practice.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>artificial intelligence</kwd><kwd>Reddit</kwd><kwd>unstructured text analysis</kwd><kwd>data extraction</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>The advancement of digital health care, especially highlighted during the COVID-19 pandemic, has significantly increased the reliance on online platforms for medical consultation and advice, profoundly changing how individuals seek medical advice over the last 2 decades [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. 
With a growing focus on platforms such as Reddit for &#x201C;Ask the Doctor&#x201D; services, Reddit&#x2019;s r/AskDocs subreddit has become a vital forum for such interactions, growing to more than 550,000 subscribers by January 2024 [<xref ref-type="bibr" rid="ref3">3</xref>]. This increase, marked by a surge in user engagement starting in 2018, exemplifies the evolving role of social media as a trusted source for medical advice on the web. Recent studies on social media platforms such as Reddit have highlighted active user engagement in health-related discussions, such as medication abortion [<xref ref-type="bibr" rid="ref4">4</xref>] and dermatology [<xref ref-type="bibr" rid="ref5">5</xref>]. The r/AskDocs subreddit has been a focal point for analyzing user demographics and health topic trends, marked by a dramatic increase in user posts over time [<xref ref-type="bibr" rid="ref3">3</xref>]. This trend toward asynchronous health care, where individuals engage with health care professionals and peers over digital platforms, underscores a shift in how medical advice is sought and dispensed in the modern era.</p><p>The potential to harness insights from these forums is immense, offering a unique window into patient concerns, misconceptions, and the public&#x2019;s health-seeking behaviors. However, the vast, unstructured nature of forum data presents a formidable challenge; the extraction and meaningful analysis of such data require advanced tools that can navigate the complexities of language and context inherent in user-generated content. Recently, large language models (LLMs) have been used to extract data from unstructured text. These models can navigate much of the nuances of the languages in forums and social media to extract usable data at what appears to be similar to human levels. 
Understanding the strengths and the limitations of the use of LLMs as compared with humans for data extraction is the focus of this research.</p></sec><sec id="s1-2"><title>Traditional Methods of Information Extraction From Text</title><p>Regular expressions (often shortened to regex), a staple in text processing, and many other traditional natural language processing tools offer rule-based approaches to identifying specific patterns within text. For example, regex can be used to locate all instances of email addresses or phone numbers within a database by defining the patterns that match email addresses and phone numbers. In the realm of data extraction from web-based health forums such as the AskDocs subreddit, the intricacy of human language and the unstructured nature of user submissions present significant challenges. Users frequently provide a wealth of information, albeit in varied formats that defy simple pattern matching. The diversity in the presentation of these data complicates the task of developing regular expressions that can accurately and consistently extract the desired information [<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>To illustrate the complexity of this task, consider the following examples that represent patterns seen in posts on the AskDocs subreddit (<xref ref-type="table" rid="table1">Table 1</xref>). These examples highlight the variability and nuanced nature of the information provided by users, underscoring the difficulties in crafting regex patterns capable of effectively parsing and categorizing these data.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Examples of AskDocs subreddit user submissions and their complexity.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Text</td><td align="left" valign="bottom">Explanation</td></tr></thead><tbody><tr><td align="left" valign="top">&#x201C;I&#x2019;m worried about a white lump on my elbow. 
Age: 31; Race: Japanese; Sex: Male&#x201D;</td><td align="left" valign="top">The text clearly states their age and race but uses &#x201C;white&#x201D; first in a medical context, not as a race.</td></tr><tr><td align="left" valign="top">&#x201C;Mid-30s F here, experiencing severe headaches. Also, I&#x2019;m 5 foot 6&#x201D; and around 60-ish kg.&#x201D;</td><td align="left" valign="top">Use of approximate age (&#x201C;Mid-30s&#x201D;) and nonstandard expressions for measurements (&#x201C;5 foot 6&#x201D; and &#x201C;60-ish&#x201D;).</td></tr><tr><td align="left" valign="top">&#x201C;I&#x2019;m a minor, dealing with severe migraines especially during my period. 150 cm, 60 kg&#x201D;</td><td align="left" valign="top">The text implies the user&#x2019;s sex through the mention of a menstrual cycle but does not explicitly state it.</td></tr></tbody></table></table-wrap><p>These examples underscore the inherent challenges in using regular expressions for data extraction from AskDocs subreddit. The variability in how users report their demographic information, combined with the use of language in medical contexts, necessitates a highly sophisticated system using a variety of regex patterns. Building such a system is not only daunting but also entirely impractical.</p></sec><sec id="s1-3"><title>Large Language Models</title><p>In contrast, artificial intelligence (AI) and particularly LLMs introduce a paradigm shift in data extraction. LLMs trained on vast corpora of text demonstrate an understanding of language nuances, contextual meaning, and the implicit cues embedded within text. This enables LLMs to interpret and categorize complex information without the need for explicitly defined rules, as is the case with regex [<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>The integration of LLMs in analyzing web-based health forums represents a significant advancement in this field. 
LLMs trained on extensive datasets have proven effective in understanding and generating human-like text. For instance, the GatorTron model, a large clinical transformer model, demonstrated remarkable performance in extracting and using patient information from clinical narratives [<xref ref-type="bibr" rid="ref8">8</xref>]. Furthermore, the effectiveness of LLMs in preserving privacy while extracting information highlights their growing importance in sensitive domains such as health care [<xref ref-type="bibr" rid="ref9">9</xref>].</p><p>In addition to their general capabilities, LLMs have shown proficiency in tasks such as information extraction, categorizing text data, and identifying sentiment in complex and unstructured data settings [<xref ref-type="bibr" rid="ref10">10</xref>]. Another study showed how fine-tuning LLMs such as GPT-3 can accurately extract complex scientific knowledge [<xref ref-type="bibr" rid="ref11">11</xref>]. This makes them highly suitable for extracting nuanced information from health-related discussions on the web. In summary, the existing research lays a comprehensive foundation for understanding web-based health-seeking behavior, with LLMs playing a crucial role in advancing the understanding and analysis capabilities in digital health communication.</p></sec><sec id="s1-4"><title>Research Question and Aim</title><p>Amidst this backdrop, this study evaluates the methodology of using LLMs to transform the unstructured, information-rich text data from sources such as the AskDocs subreddit into a structured dataset. Unlike traditional data extraction methods, which struggle with the variability and complexity of natural language, LLMs offer a context-aware, nuanced approach that more closely aligns with human cognitive processes. 
The following research question seeks to evaluate the methodology&#x2019;s effectiveness and explore its broader applications: How do the accuracy and agreement of LLMs in labeling web-based health communication compare with human annotators in extracting and categorizing complex information from social media datasets such as the AskDocs subreddit?</p><p>This study focuses on the methodology and its validation, aiming not only to demonstrate the feasibility of using LLMs for analyzing web-based health communication but also to explore their broader implications for digital research and practice. To demonstrate the method&#x2019;s use, a brief analysis of the extracted dataset was conducted to illustrate future use cases. This approach seeks to uncover new avenues for understanding web-based behaviors and interactions.</p><p>While this research focuses on extracting structured datasets from health forum data, the applicability of LLMs extends far beyond this realm. Their versatility and advanced understanding of natural language may make them suitable for various fields requiring data extraction and analysis. These fields could include legal document analysis [<xref ref-type="bibr" rid="ref12">12</xref>], financial report summarization, and sentiment analysis in social media [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref15">15</xref>]. LLMs offer a powerful tool for transforming unstructured text into actionable insights. This broad applicability underscores the transformative potential of LLMs across multiple sectors, promising to revolutionize data analysis and knowledge extraction in an array of disciplines.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><p>This section outlines the processes for data collection, human and LLM labeling, and comparison of label similarity between human annotators and LLMs, as outlined in <xref ref-type="fig" rid="figure1">Figure 1</xref>. 
The following sections describe each component of the flowchart in detail.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Flowchart of methods used for this study. LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e74094_fig01.png"/></fig><sec id="s2-1"><title>Data Collection</title><p>The AskDocs subreddit is part of Reddit, which hosts more than 13 billion posts across more than 100,000 subreddits, engaging more than 50 million daily users [<xref ref-type="bibr" rid="ref16">16</xref>]. Due to limitations in Reddit&#x2019;s official application programming interface (API) (PRAW) for retrieving historical data, the Pushshift API [<xref ref-type="bibr" rid="ref17">17</xref>], a third-party archive of Reddit&#x2019;s post metadata, was used. Despite known gaps in Pushshift&#x2019;s data coverage, it remains a valuable tool for accessing large volumes of Reddit data, including the AskDocs subreddit.</p><p>Data were extracted from AskDocs subreddit posts spanning from inception in July 2013 to October 2022, comprising 1,016,229 posts and 2,122,081 comments. While acknowledging the challenges in obtaining a complete dataset, the available data were substantial for extracting insights and addressing the research objectives of this study. A random sample of 2800 AskDocs subreddit posts was used for comparing data extraction using human labeling and LLM labeling. A different sample of approximately 30,000 AskDocs subreddit posts was used to demonstrate the LLM data retrieval methodology.</p></sec><sec id="s2-2"><title>Human-Labeling Process</title><p>Human labeling for extracting and categorizing information from AskDocs subreddit posts was essential for this study for multiple reasons. 
First, it provided a method for creating the &#x201C;gold standard&#x201D; dataset used for evaluating the accuracy and reliability of LLMs relative to human-level understanding and judgment. Second, the comparison of LLM annotation against the gold standard, and of individual human annotators against the gold standard, sheds light on the potential of LLMs to augment human efforts in terms of efficiency, scalability, and consistency. Understanding where LLMs excel or fall short compared with human annotators allows for better utilization of capabilities, identification of areas for improvement, and refinement of methodologies to enhance their performance in real-world applications.</p><p>In collaboration with the University of Nevada, Reno Medical School, 27 medical students served as human annotators for data labeling. The students were tasked with categorizing the posts according to the extraction criteria outlined in <xref ref-type="table" rid="table2">Table 2</xref>, which had been reviewed and approved by a physician consultant (JW) to ensure clinical relevance and accuracy. Each medical student was required to read the training material, which outlined the task, explained all the data fields and value options, and had several real Reddit post examples with justifications for each chosen label for that post. The training document, found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, was designed to address the various labeling challenges annotators might encounter. These labeling guidelines were followed by each annotator and specifically referred to when resolving differences. 
While some labeling categories are inherently subjective, they were specifically included to distinguish the kinds of questions users were asking and shed light on the capabilities of LLMs for extracting subjective data.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Information fields and possible responses for human and large language model data extraction from the AskDocs subreddit posts.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Field</td><td align="left" valign="bottom">Options</td></tr></thead><tbody><tr><td align="left" valign="top">Biological Sex</td><td align="left" valign="top">M, F, Unknown, N/A<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td></tr><tr><td align="left" valign="top">Gender Identity</td><td align="left" valign="top">M, F, Other, N/A</td></tr><tr><td align="left" valign="top">Age</td><td align="left" valign="top">A numerical value, Unknown, N/A</td></tr><tr><td align="left" valign="top">Height</td><td align="left" valign="top">Height in formats such as 6&#x2019;0&#x201D; or 170 cm, Unknown, N/A</td></tr><tr><td align="left" valign="top">Height Units</td><td align="left" valign="top">Feet/in, cm, m, N/A</td></tr><tr><td align="left" valign="top">Weight</td><td align="left" valign="top">A numerical value, Unknown, N/A</td></tr><tr><td align="left" valign="top">Weight Units</td><td align="left" valign="top">lbs, kg, N/A</td></tr><tr><td align="left" valign="top">Race</td><td align="left" valign="top">White, Asian, Black, Hispanic, Other, Unknown</td></tr><tr><td align="left" valign="top">Diagnosis-Based Medical Inquiry</td><td align="left" valign="top">True, False, N/A</td></tr><tr><td align="left" valign="top">Symptom-Based Medical Inquiry</td><td align="left" valign="top">True, False, N/A</td></tr><tr><td align="left" valign="top">Treatment-Based Medical Inquiry</td><td align="left" valign="top">True, False, N/A</td></tr><tr><td 
align="left" valign="top">Proxy Relationship</td><td align="left" valign="top">N/A, significant other, friend, child, other</td></tr><tr><td align="left" valign="top">Chronic Condition</td><td align="left" valign="top">True, False, N/A</td></tr><tr><td align="left" valign="top">Healthcare Consultation Status</td><td align="left" valign="top">Preconsultation, in consultation, postconsultation, N/A</td></tr><tr><td align="left" valign="top">Primary Focus Topic</td><td align="left" valign="top">Multiple possible topics</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>N/A: not available or not applicable. </p></fn></table-wrap-foot></table-wrap><p>The sample size was based on each medical student&#x2019;s 10-hour time availability. It was assumed that each student annotator could categorize the extraction fields for 1 AskDocs subreddit post in an average of 1.5 minutes, which would yield 400 labeled posts over a 10-hour period. The research team then randomly selected 3600 posts and organized them into 9 batches of 400 posts each to provide a comprehensive snapshot of the prevalent dialogues on the platform. Each batch was then reviewed by 3 medical student annotators to ensure thorough examination of each post from multiple perspectives.</p><p>The execution of this plan encountered practical challenges. Despite initial aspirations, only 2 of the 9 batches saw the completion of labeling by all 3 assigned annotators. Five batches had the contribution of 2 students; unfortunately, 2 batches were labeled by a single student. To maintain quality, the team ensured that each post was reviewed by at least 2 individuals to guarantee robustness and reliability of the human-labeled dataset; the decision was made to exclude the batches that were reviewed only by 1 student. 
As a result, the final dataset comprised 2800 posts, each labeled by at least 2 medical students.</p><p>The gold standard dataset was based on the majority response from the human annotations. In cases of disagreement, the lead researcher&#x2014;who created the labeling guidelines and followed them closely&#x2014;resolved any disagreement using the same criteria provided in the annotator guidelines. This approach ensured consistency in applying the guidelines across all labeled data. This dataset then served as the gold standard for a comprehensive comparison with the data extraction by various LLMs.</p></sec><sec id="s2-3"><title>Human Interannotator Agreement</title><p>Cohen &#x03BA; score, a statistical measure for evaluating the level of agreement between 2 or more annotators, was used to assess interannotator reliability [<xref ref-type="bibr" rid="ref18">18</xref>]. This measure accounts for the possibility of chance agreement in its calculation, making it a more robust indicator of interannotator reliability than simple percentage agreement. The magnitude of the Cohen &#x03BA; score indicates the level of agreement, with a score of 1 corresponding to perfect agreement. There are different ways to categorize agreement level using the Cohen &#x03BA; score. One such categorization proposed by McHugh [<xref ref-type="bibr" rid="ref19">19</xref>] is shown in <xref ref-type="table" rid="table3">Table 3</xref>.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Interpretation of Cohen &#x03BA; as proposed by McHugh [<xref ref-type="bibr" rid="ref19">19</xref>].</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">&#x03BA; Score</td><td align="left" valign="bottom">Agreement level</td></tr></thead><tbody><tr><td align="char" char="." valign="top">0&#x2010;0.2</td><td align="left" valign="top">Almost none</td></tr><tr><td align="char" char="." 
valign="top">0.21&#x2010;0.39</td><td align="left" valign="top">Minimal</td></tr><tr><td align="char" char="." valign="top">0.40&#x2010;0.59</td><td align="left" valign="top">Weak</td></tr><tr><td align="char" char="." valign="top">0.60&#x2010;0.79</td><td align="left" valign="top">Moderate</td></tr><tr><td align="char" char="." valign="top">0.80&#x2010;0.90</td><td align="left" valign="top">Strong</td></tr><tr><td align="left" valign="top">Above 0.90</td><td align="left" valign="top">Almost perfect</td></tr></tbody></table></table-wrap><p>In the realm of data labeling, particularly for qualitative data with subjective categories, the Cohen &#x03BA; score is a standard metric for measuring annotator consistency. Interannotator agreement with a Cohen &#x03BA; score is illustrated in the matrix, showcasing the degree of consensus among different pairs of human annotators on various categories.</p><p>Human labeling was essential to this study, as it produced the gold standard dataset needed to evaluate LLM performance against human-level comprehension. The Cohen &#x03BA; scores derived from the human annotators shed light on an inherent aspect of human-mediated data labeling: diversity in human judgment. While high levels of agreement in some categories (left side of <xref ref-type="fig" rid="figure2">Figure 2</xref>) validate the clarity of our guidelines, the variability in others (right side of <xref ref-type="fig" rid="figure2">Figure 2</xref>) reflects the natural divergence in human interpretation. This phenomenon serves as a crucial reminder that disagreements between the gold standard dataset and LLMs&#x2019; generation of labels do not inherently signify an error on the LLM&#x2019;s part; rather, they may simply reflect the educated guesses that humans often make in the face of ambiguity. Crucially, the variability in human annotations, as visualized in <xref ref-type="fig" rid="figure3">Figure 3</xref>, is the reason multiple human annotators are necessary. 
The gold standard is created as an aggregate of the human annotations.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Detailed performance comparison of large language models in labeling AskDocs posts across various categories.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e74094_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Cohen &#x03BA; matrix displaying the agreement between different pairs of human annotators across the different categories of extraction. A higher &#x03BA; score indicates a stronger agreement.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e74094_fig03.png"/></fig></sec><sec id="s2-4"><title>High Disagreement Fields</title><p>As shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>, the target questions exhibited varying levels of difficulty even for human annotators. The topics are categorized as Treatment-Based, Diagnosis-Based, and Symptom-Based. These fields aim to determine the reason for an individual&#x2019;s post. Treatment-Based questions pertain to discussions about treatments, Diagnosis-Based questions relate to diagnoses, and Symptom-Based questions focus on symptoms. However, these categories can sometimes be ambiguous, as demonstrated by the following post:</p><disp-quote><p>Dear doctors, I currently have a very minor case of poison ivy than that of my cases in the past; my girlfriend although believes it is contagious and refuses to make any contact with me (eg, Hold my hand). Is poison ivy contagious? 
I am a male, 18, and have had poison ivy for about 4 days now.</p></disp-quote><p>In this instance, one annotator labeled the post as Diagnosis-Based, likely due to the mention of a poison ivy case, while the other 2 annotators classified it as Symptom-Based, focusing on the symptoms of poison ivy.</p><p>Other challenging questions include those related to Chronic Conditions, Proxy Relationships, and Healthcare Consultation Status. Chronic Conditions questions inquire whether the issue is ongoing, Proxy Relationship questions address the poster&#x2019;s relationship to the person with the issue, and Healthcare Consultation Status questions indicate Preconsultation, In Consultation, or Postconsultation, depending on whether the individual has not yet seen a doctor, is currently seeing one, or has already done so. The following three items are examples of posts that led to disagreements among annotators:</p><list list-type="order"><list-item><p>&#x201C;F23 test results came back and suggest a possible cyst. Should I pursue treatment? Here are the results. I originally went to the doc for lower back and side pain&#x201D; (Consultation Status)</p></list-item><list-item><p>&#x201C;Are these bug bites, and if so, should I be concerned? Here are some pictures of the bumps in question. The pictures are two weeks old, and I still have them.&#x201D; (Chronic Condition)</p></list-item><list-item><p>&#x201C;Do children need antibiotics for a UTI? Writing about a 7F. Has been needing to urinate per every few minutes. Luckily no pain when urinating and no blood either. Can this go away with cranberry juice and lots of water or are antibiotics needed?&#x201D; (Proxy Relationship)</p></list-item></list><p>As an indicator of the LLM&#x2019;s effectiveness, we use accuracy against the gold standard. 
Given the low agreement among human annotators on these topics, similar disagreements from LLMs reflect the inherent ambiguity within the dataset.</p></sec><sec id="s2-5"><title>LLM Labeling Process</title><p>To extract the structured data from the unstructured AskDocs subreddit, prompt engineering was used with the LLMs evaluated in this study. Prompt engineering is a critical process in the application of LLMs. It involves the careful design of prompts or instructions that guide the model in understanding and performing the desired task. The significance of prompt engineering lies in its ability to leverage the model&#x2019;s inherent capabilities by translating the task at hand into a format that the model can comprehend and execute effectively, increasing the probability of receiving a correct response in the desired format. In this study, the techniques used in engineering prompts for this task were JavaScript Object Notation (JSON) fields formatting and few-shot prompting.</p></sec><sec id="s2-6"><title>JSON Fields Formatting</title><p>To comprehensively capture information shared by AskDocs subreddit users, a set of fields was defined within a JSON structure, a common format for representing data in a clear and accessible manner. Each field was designed to hold specific types of information, with permissible values outlined to ensure consistency and accuracy in the extracted data. The structured nature of JSON facilitated the straightforward combination of these fields into a cohesive dataset, where each post was transformed into a structured object. The same fields and values used by the human annotators were applied to the LLM outputs (<xref ref-type="table" rid="table2">Table 2</xref>).</p></sec><sec id="s2-7"><title>Few-Shot Prompting</title><p>Few-shot prompting with LLMs is an approach designed to enhance the models&#x2019; ability to perform specific tasks. 
Few-shot prompting involves creating a prompt that includes several examples of the task at hand (often in a Q/A or Input/Output pair format), followed by the new task to be completed. This technique effectively &#x201C;primes&#x201D; the model by providing it with a few examples of how to complete a specific task, thereby improving its ability to understand and execute similar tasks with new data. LLMs can perform few-shot prompting without fine-tuning, and Brown [<xref ref-type="bibr" rid="ref20">20</xref>] showed that LLMs can perform numerous natural language processing tasks when provided a few examples in its prompt.</p><p>For instance, if the task involves extracting demographic information from unstructured health forum posts, a few-shot prompt might include examples as follows:</p><disp-quote><p>Example 1</p><p>Input: <italic>I&#x2019;m a 34-year-old male experiencing frequent headaches</italic>.</p><p>Output: Age: 34, Gender: Male, Concern: frequent headaches.</p><p>Example 2</p><p>Input: <italic>Female, 29, noticing a rash that appeared last week</italic>.</p><p>Output: Age: 29, Gender: Female, Concern: rash appeared last week.</p></disp-quote><p>After presenting a few such examples, the model is then given a new, unseen piece of text and asked to perform the same task. This method capitalizes on the LLM&#x2019;s ability to discern patterns and apply the learned extraction process to new data, enabling more accurate identification and categorization of information. Few-shot prompting thus represents a powerful tool in the prompt engineering tool kit, significantly enhancing the LLM&#x2019;s use for data extraction [<xref ref-type="bibr" rid="ref21">21</xref>]. This technique is crucial for several reasons:</p><list list-type="bullet"><list-item><p>Flexibility: Few-shot prompting allows models to adapt quickly to new tasks without extensive model fine-tuning. 
Traditionally, adapting a model to perform a new or specific task necessitates a substantial investment in data labeling and computational resources for training, often making the process cost-prohibitive and time-consuming. Few-shot prompting, however, leverages the preexisting knowledge and versatility of LLMs, enabling them to understand and execute tasks with just a handful of examples.</p></list-item><list-item><p>Consistency: Few-shot prompting helps in standardizing the output format, improving the odds that the LLM generates data in the defined JSON structure.</p></list-item><list-item><p>Accuracy in information extraction: Previous work has shown that few-shot prompting of LLMs has the potential to drastically increase accuracy across a multitude of tasks of varying complexity.</p></list-item><list-item><p>Proper JSON field creation: Although recent advancements allow enforcing JSON formatting through both OpenAI APIs and locally hosted models, these methods do not guarantee the generation of JSON objects with the correct fields. Few-shot prompting addresses this limitation by explicitly illustrating how each field should be populated, encouraging the model to produce objects with the appropriate field &#x201C;keys.&#x201D;</p></list-item></list><p>The prompt structure used in this study included (1) a brief introduction to the task, clarifying the goal of converting unstructured text into structured JSON format; (2) detailed instructions on how to approach the analysis, specifying the information that needs to be extracted and how it should be categorized into the JSON fields; and (3) examples to illustrate the labeling process, serving as templates for the LLM to follow. These examples demonstrate how to fill out each JSON field based on the content of the posts, ensuring clarity and precision in the output.</p><p>In this study, the researchers experimented with both 2-shot and 7-shot prompting techniques. 
Using only 2 or even 7 examples may seem limited; however, this is a defining feature of few-shot prompting. The way the prompt is crafted can significantly influence the results. Zhao et al [<xref ref-type="bibr" rid="ref22">22</xref>] demonstrated that while increasing the number of examples can improve the accuracy of results, the gains tend to diminish as more examples are added. In addition, LLMs are susceptible to majority label bias, where the output is biased toward labels that are more frequent in the prompt. To mitigate this bias, examples were carefully selected that included a diverse range of labels, minimizing repetition wherever possible. All examples were taken from the training set and chosen prior to evaluation to avoid any overlap with test data. Labels were selected based on practicality, prioritizing diversity of content while keeping the full prompt within the model&#x2019;s size limit.</p><p>By adopting few-shot prompting, the aim was to leverage the LLMs&#x2019; capabilities for consistent and accurate information extraction without expensive fine-tuning of the full LLM and to show the accuracy and the ability of these models to generalize for data extraction given just a few examples. In many respects, the LLMs were given the same amount of training examples as were the human annotators. The prompt used in our 7-shot prompting can be found in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p></sec><sec id="s2-8"><title>LLMs Evaluated</title><p>Four main types of models were used in this study: open-source models using Llama 3 from Meta [<xref ref-type="bibr" rid="ref23">23</xref>] and Gemma from Google [<xref ref-type="bibr" rid="ref24">24</xref>], along with proprietary models using GPT-3.5 [<xref ref-type="bibr" rid="ref25">25</xref>] and GPT-4 [<xref ref-type="bibr" rid="ref26">26</xref>] from OpenAI. 
The advantage of open-source models is that they are free to use, and the user has greater control of how the data are used and stored. Proprietary models, on the other hand, are perceived to be more accurate.</p><p>Llama 3 comes in 2 sizes: 8 billion parameters (8B) and 70 billion parameters (70B). Gemma is a 7 billion parameter (7B) model. Models with more parameters are usually more accurate, but they also require vastly more computing and storage resources to use. In comparison, GPT-3.5 has 175 billion parameters; as there is no official disclosure of GPT-4&#x2019;s size, it can be assumed to be much larger. The analysis used few-shot prompts containing 2 examples for all models except for GPT-3.5, which was run using both 2 examples and 7 examples. Two examples were used for GPT-4 due to the high cost of adding more examples. The Llama 3 and Gemma models were run using various sizes of context for comparison.</p></sec><sec id="s2-9"><title>Ethical Considerations</title><p>There are several ethical considerations when using LLMs to analyze health-related data. While Reddit posts are publicly available, users may not expect them to be used for medical research, and many posts may contain sensitive personal health-related information. In this study, no usernames, post IDs, or other identifying metadata were included in the analysis, with all data deidentified to reduce privacy risk. In addition, the research adhered to Reddit&#x2019;s API use policy. As LLMs become increasingly deployed for health-related research, it is essential to maintain strict adherence to privacy regulations such as the Health Insurance Portability and Accountability Act (HIPAA), obtain institutional review board approval when applicable, and consider on-premises model deployment (running LLMs locally rather than through a third party) to avoid the risk of leaking sensitive data. 
Institutional review board approval for this research was proposed, approved, and classified as exempt level, category 4: Secondary research for which consent is not required.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>LLM Labeling Compared With Human Annotators</title><p>After running each model, the accuracy score (percentage agreement) was computed for the LLM results versus the gold standard (<xref ref-type="fig" rid="figure2">Figure 2</xref>). In terms of agreement with the gold standard, the top-performing models were the largest, with GPT-4 (2-shot) and Llama 3 70B (7-shot) having the highest overall accuracy. It appears that in general, GPT-4 had the highest agreement with the gold standard in the simpler fields, such as Biological Sex, Gender Identity, and Age, while Llama 3 70B performed marginally better in more subjective fields, with slightly higher percentage agreement scores in Diagnosis Based, Symptom Based, Treatment Based, and Chronic. However, this performance difference may be attributed to the larger number of few-shot examples used with Llama 3.</p><p>One key finding was the agreement of LLMs in more subjective areas, such as determining whether a condition was chronic or assessing the health care consultation status. Like human annotators, LLMs encountered challenges in these subjective categories, reflecting the inherent complexity and nuanced understanding required to make these determinations.</p><p>Moreover, a side-by-side comparison of Cohen &#x03BA; scores between the top 2 performing LLMs (GPT-4 and Llama 3 70B) and a randomly selected pair of human annotators revealed similarities in the pattern of disagreements. These results, shown in <xref ref-type="fig" rid="figure4">Figure 4</xref>, indicate that the differences between either LLM and annotator A resemble the difference between annotators A and B. 
This observation suggests that the discrepancies between LLM outputs and the gold standard dataset may mirror the natural variance found in human labeling efforts. This highlights the capabilities of LLMs to approximate human-like understanding and judgment in complex categorization tasks.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Cohen &#x03BA; scores: GPT-3.5 versus gold standard dataset and between 2 human annotators. LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e74094_fig04.png"/></fig><p>A more detailed understanding of the differences between human and LLM data extraction was gained by examining the confidence intervals for agreement with the gold standard across each data field. <xref ref-type="fig" rid="figure5">Figure 5</xref> shows heatmaps of the confidence intervals. The top heatmap (A) is colored by the mean value of the confidence interval, while the lower heatmap (B) is colored by the confidence interval ranges. As seen previously, more subjective data fields tend to have lower mean values and wider confidence intervals. Notably, LLMs showed greater consistency than the 2 human annotators. The LLM confidence interval ranges are, in general, tighter than those of the human annotators. Of note is the Proxy Relationship data field. As mentioned previously, this is a particularly difficult data field to extract from the information in the posts. All the LLMs seem to struggle in the same way. Human annotators have a significantly larger confidence interval range. 
This may be due to human annotators making more judgment calls based on other information in the post.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Agreement with gold standard 95% confidence interval heatmaps colored by mean value (A) and confidence interval range (B).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e74094_fig05.png"/></fig></sec><sec id="s3-2"><title>Demographic Insights and Health Discourse Trends</title><p>With the methodology established, it was applied to approximately 30,000 randomly sampled Reddit posts using GPT-3.5. GPT-3.5 was used in this analysis because it offered a balance of speed, cost-efficiency, and accuracy for large-scale processing at a lower cost and time compared with GPT-4. This large-scale analysis demonstrates the unique insights that can be obtained from the health discourse on the AskDocs subreddit related to age distribution, the nature of inquiries, proxy relationship posts, and health topics discussed. This section serves to demonstrate the strengths of using LLMs to extract data that would be otherwise impractical using human extraction. It focuses on the capabilities of the data extraction method. Future research may involve a more comprehensive analysis, including treatment of missing or incomplete data.</p></sec><sec id="s3-3"><title>Age Distribution</title><p><xref ref-type="fig" rid="figure6">Figure 6</xref> illustrates the age distribution of users partaking in the AskDocs subreddit. The visible skew toward a younger demographic may reflect a generational trend in using online platforms for health-related guidance. 
This observation aligns with broader usage patterns on Reddit, where 44% of users are aged between 18 and 29 years, and 31% are aged between 30 and 49 years [<xref ref-type="bibr" rid="ref27">27</xref>], suggesting that the demographic trends observed in AskDocs may indeed be representative of the general Reddit user base.</p><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Age distribution of AskDocs users.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e74094_fig06.png"/></fig></sec><sec id="s3-4"><title>Nature of Inquiry</title><p>When the posts are segmented by the nature of the inquiry: Diagnosis-Based, Treatment-Based, or Symptom-Based, the results show that 85% are diagnosis-based posts, 66% contain treatment-based information, and 93% contain symptom-based information. Note that a single post may contain any or all 3 of the categories. The predominance of Symptom-Based queries suggests that users are often at an initial stage of seeking health information, which may involve checking symptoms, sometimes anonymously, before seeking formal medical consultation.</p></sec><sec id="s3-5"><title>Proxy Relationship Posts</title><p>The vast majority (95%) of inquiries are made by individuals concerning their own health, highlighting the subreddit&#x2019;s predominant role in facilitating personal health inquiries. 
When queries are made on behalf of others, they are predominantly for significant others or children (<xref ref-type="table" rid="table4">Table 4</xref>).</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Proxy relationship posts as a percentage of total posts.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Proxy relationship</td><td align="left" valign="bottom">Percentage of all posts</td></tr></thead><tbody><tr><td align="left" valign="top">OP&#x2019;s<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup> significant other</td><td align="char" char="." valign="top">1.765</td></tr><tr><td align="left" valign="top">OP&#x2019;s child</td><td align="char" char="." valign="top">1.459</td></tr><tr><td align="left" valign="top">OP&#x2019;s friend</td><td align="char" char="." valign="top">0.257</td></tr><tr><td align="left" valign="top">OP&#x2019;s sibling</td><td align="char" char="." valign="top">0.095</td></tr><tr><td align="left" valign="top">Other</td><td align="char" char="." valign="top">0.063</td></tr><tr><td align="left" valign="top">OP&#x2019;s parent</td><td align="char" char="." valign="top">0.025</td></tr><tr><td align="left" valign="top">OP&#x2019;s relative</td><td align="char" char="." valign="top">0.007</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>OP&#x2019;s: original poster&#x2019;s.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-6"><title>Health Topics</title><p>Finally, <xref ref-type="fig" rid="figure7">Figure 7</xref> details what percentage of posts pertained to each of the top 10 most frequently discussed health topics. 
Notably, &#x201C;Anxiety&#x201D; and &#x201C;Respiratory Infections&#x201D; both saw marked increases in discussion volume from 2019 to 2020, likely influenced by the COVID-19 pandemic, reflecting public health trends and possibly exacerbated public anxieties [<xref ref-type="bibr" rid="ref28">28</xref>].</p><fig position="float" id="figure7"><label>Figure 7.</label><caption><p>Most common health topics discussed on AskDocs as a percentage of all posts over time.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e74094_fig07.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study aimed to assess the application of LLMs in systematically converting the rich, unstructured textual data from the Reddit r/AskDocs subreddit into a structured dataset, a method that more closely mirrors human cognitive processes than conventional data extraction techniques. This comparative analysis shed light on the efficacy of labeling via LLMs relative to that of human annotators in the nuanced domain of a web-based health forum such as Reddit. The insights garnered point toward both the strengths and limitations of current AI technologies in domain-specific content understanding, paving the way for further research and development in the field of digital health communication.</p><p>To further demonstrate the potential of the LLM-labeled data, a cursory analysis was conducted that revealed patterns and trends within the AskDocs subreddit community. 
Insights such as these have the potential to guide public health research, tailor medical advice services, and support targeted health information dissemination, although further validation across more diverse datasets and additional forums is necessary for broader applicability and verification of these results.</p><p>The findings shown in <xref ref-type="fig" rid="figure4">Figure 4</xref> indicate that Llama 3 70B with a 7 few-shot prompt and GPT-4 with a 2 few-shot prompt had the highest agreement with benchmark human-annotated data among the models run. Due to financial constraints, GPT-4 with a 7 few-shot prompt was not run. These results reflect the advanced capabilities of these models to understand and process complex health-related information. Few-shot examples may enhance the performance of LLMs by improving their ability to recognize specific patterns, as they help the model interpret tasks more accurately [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref29">29</xref>].</p><p>It is important to note that Llama 3 70B is an open-source model, which allows it to be downloaded and run locally without incurring additional costs. This feature becomes particularly significant when considering information privacy or scenarios where data sensitivity prohibits the use of web-based servers. Furthermore, the performance of the open-source Llama 3 70B is comparable with its proprietary counterpart, thereby enabling the application of these techniques in research contexts where such resources might otherwise be unavailable.</p><p>The pattern of disagreement between LLMs versus the gold standard and human annotators versus the gold standard exhibited notable similarities. Fields where human annotators had low agreement with the gold standard, as depicted in <xref ref-type="fig" rid="figure2">Figure 2</xref>, also posed challenges for LLMs. 
This suggests that the complexity of these questions may explain reduced LLM agreement, reflecting the same areas of human disagreement in the annotated dataset. For instance, fields with high disagreement among human annotators also showed low agreement between human annotators and LLMs, as shown by decreased accuracy. Conversely, fields where humans achieved high Cohen &#x03BA; scores, such as Biological Sex, also demonstrated high accuracy from LLMs.</p><p>This notion is further supported by the results illustrated in <xref ref-type="fig" rid="figure5">Figure 5</xref>. The comparison of Cohen &#x03BA; scores between our LLMs with the highest percentage agreement with the gold standard and a randomly selected pair of human annotators revealed that the pattern of disagreement between LLMs and human annotators mirrored the disagreement among the annotators themselves. This suggests that the consistency of LLMs is comparable with that of human annotators. It further indicates that LLMs can approximate human judgment, although perfect coding remains unlikely in subjective categories for both LLMs and humans. Therefore, when human annotators disagree with similar consistency as an LLM does with them, it may be reasonable to consider the LLM annotations with the same weight as those made by human annotators.</p><p>The ability of LLMs to extract health data from large-scale, unstructured sources such as Reddit posts has implications for clinicians and public health researchers. For example, health care providers may use the AskDocs subreddit to identify concerns related to emerging health issues, explore health-related misconceptions, and monitor the side effects of medications. From a public health perspective, understanding the health-related issues and their determinants is important for informing the development of interventions such as health communication campaigns, educational programs, policy initiatives, and environmental changes. 
In addition, LLMs could provide real-time health sentiment analysis, which public health organizations could leverage to improve interventions in response to changing conversations and attitudes in web-based communities.</p></sec><sec id="s4-2"><title>Operational and Economic Analysis</title><p>The utilization of LLMs for data extraction, especially within the domain of health care and online forums such as the AskDocs subreddit, offers advantages over traditional manual methods in terms of efficiency, consistency, scalability, and privacy.</p><p>Unlike human annotators, LLMs can analyze and extract data from thousands of documents in a fraction of the time. Rapid processing is valuable for time-sensitive tasks such as monitoring health forums for emergent public health concerns or extracting patient information from clinical notes in real time. Unlike humans, whose performance may fluctuate due to fatigue or subjective interpretation, LLMs maintain a high degree of reliability and consistency in data extraction tasks.</p><p>Scalability is another benefit, as LLMs can be parallelized and deployed across multiple servers, allowing for simultaneous processing of data from various sources. In addition, LLMs can be configured to extract relevant information while preserving user anonymity and discarding sensitive personal data, thus supporting compliance with regulations such as HIPAA [<xref ref-type="bibr" rid="ref30">30</xref>].</p><p>The costs and time required based on the current OpenAI API&#x2019;s tier 5 rate limit and token pricing [<xref ref-type="bibr" rid="ref31">31</xref>] are shown in <xref ref-type="table" rid="table5">Table 5</xref>. From an economic and operational standpoint, LLMs are very efficient. For example, labeling 30,000 posts manually at 1 post per minute would take 500 hours and cost approximately US $7500 at a rate of US $0.25 per post. 
In contrast, GPT-3.5 Turbo can perform the same task at a cost of only US $285 and in just more than 4 hours, processing 117 posts per minute. The substantial cost and time efficiencies of GPT-3.5 Turbo compared with both human annotators and the more expensive GPT-4 model are notable. The ability to process large datasets quickly and economically with GPT-3.5 demonstrates its advantage for users needing high throughput and cost-effectiveness, whereas GPT-4 offers advanced capabilities at a higher expense. As the LLM landscape progresses, the costs associated with these technologies are expected to fluctuate, potentially making high-capability models such as GPT-4 more accessible.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Cost and time comparison in US dollars for labeling 30,000 posts using different methods.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Method</td><td align="left" valign="bottom">Cost per post</td><td align="left" valign="bottom">Total cost</td><td align="left" valign="bottom">Posts per minute</td><td align="left" valign="bottom">Total time (hours)</td></tr></thead><tbody><tr><td align="left" valign="top">Human</td><td align="left" valign="top">$0.25</td><td align="left" valign="top">$7500</td><td align="left" valign="top">1</td><td align="left" valign="top">500</td></tr><tr><td align="left" valign="top">GPT-3.5 Turbo</td><td align="left" valign="top">$0.0095</td><td align="left" valign="top">$285</td><td align="left" valign="top">117</td><td align="left" valign="top">4.27</td></tr><tr><td align="left" valign="top">GPT-4</td><td align="left" valign="top">$0.54</td><td align="left" valign="top">$16,200</td><td align="left" valign="top">18</td><td align="left" valign="top">28.33</td></tr><tr><td align="left" valign="top">Llama</td><td align="left" valign="top">Free</td><td align="left" valign="top">Free</td><td align="left" valign="top">19</td><td align="left" 
valign="top">26.31</td></tr></tbody></table></table-wrap></sec><sec id="s4-3"><title>Limitations and Future Research</title><p>Despite the demonstrated strengths of using LLMs in health-related information extraction, this research has limitations that pave the way for future research opportunities. LLM research is a rapidly advancing field, with new models and techniques regularly emerging. The methodologies used in this study could be refined by integrating state-of-the-art models and approaches that have been developed since the time of our research. Ongoing research should integrate the latest advancements to enhance the accuracy, efficiency, and reliability of data extraction processes.</p><p>Furthermore, the study faced financial limitations that restricted our ability to fully use GPT-4 with n-shot samples. Although data trends indicated that the models performed better with an increased number of n-shot samples, we were unable to conduct extensive experimentation with GPT-4 to its fullest potential. Consequently, the results might have been enhanced if we had the resources to conduct more extensive testing with additional n-shot samples.</p><p>A significant area for future research lies in applying these methodologies to analyze HIPAA-protected data. Currently, accessing such data involves complex legal processes to ensure privacy and compliance. By processing these data through an LLM, it may be possible to effectively extract the information without direct human access to protected information, thereby facilitating analysis that was previously hindered by legal and ethical constraints. Research exploring the extent to which LLMs can maintain data anonymity while still providing valuable insights would be highly beneficial. 
In addition, research should explore technical solutions such as on-premises deployment (running LLMs on local servers) to minimize exposing sensitive data and follow ethical guidelines for using AI in health care.</p><p>The introduction of new models with more extensive context lengths (allowing for longer prompts) provides an opportunity to include more examples in few-shot prompting, which may improve the model&#x2019;s understanding and execution of the data extraction task. Investigating whether the incorporation of more examples enhances the model&#x2019;s performance would provide valuable insights into the few-shot learning capabilities of LLMs. This research could involve experimental comparisons between models with varying context lengths to determine the optimal number of examples for accurate data extraction.</p><p>While this study targets the medical domain, particularly the AskDocs subreddit, the methodologies used can and should be validated in a broader range of domains. Future research should extend beyond health forums to encompass a wide array of fields, creating large-scale datasets and using LLMs for data extraction in each context. Comparing the performance of LLMs with expert human annotators and established automated methods across these varied domains is essential. This expanded benchmarking will not only solidify our understanding of LLMs&#x2019; practical limitations but also verify their reliability and adaptability to diverse applications. Such cross-domain validation will underscore the versatility of LLMs and inform their refinement for specialized tasks.</p><p>Finally, the user base of Reddit is generally younger and more technologically literate, which may bias these data toward population segments that may not reflect broader public demographics. 
As such, any policy-related decisions should be made in the context of the demographics of this study.</p></sec><sec id="s4-4"><title>Conclusions</title><p>While this study has laid the groundwork for understanding the strengths and limitations of the use of LLMs in extracting structured data from unstructured text, there remains significant potential for further exploration. In this study, LLMs&#x2019; performance was comparable with human annotators in extracting demographic and health-related information from unstructured posts on the AskDocs subreddit. LLMs offer many advantages, including scalability, consistency, and cost-efficiency, while processing tens of thousands of posts in a fraction of the time and cost required for manual annotation. However, both LLMs and human annotators struggled with subjective labeling fields, underscoring the complexity and ambiguity of some health-related posts. Further in-depth study surrounding the characterization of misclassifications and various model-specific weaknesses is part of ongoing research. Despite these challenges, accuracy in objective feature extraction and consistency in LLM and human disagreements for subjective features suggest that LLMs can be viable, scalable tools in digital health research. Continuous advancements in LLM technology, combined with rigorous research into their applications and implications, will enhance understanding and contribute to the evolution of digital health care research.</p></sec></sec></body><back><notes><sec><title>Data Availability</title><p>The datasets analyzed in this study are publicly available via the Pushshift application programming interface [<xref ref-type="bibr" rid="ref5">5</xref>], a third-party archive of Reddit&#x2019;s post metadata.</p></sec></notes><fn-group><fn fn-type="con"><p>CW and EL developed the code and drafted the first version of the paper. QS, CLH, and ESNT analyzed the data and wrote the final version of the paper. 
JW supervised the medical students doing the manual coding of the data. All authors participated in revisions for the final draft of the paper. The following medical students at the University of Nevada, Reno, contributed to the paper by doing the manual data extraction for the gold standard dataset: TC, MH, CS, MA, MS, RO, DM, JG, CM, CH, XZ, OA, LB, AG, MN, SM, JW, IR, and IR.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb3">HIPAA</term><def><p>Health Insurance Portability and Accountability Act</p></def></def-item><def-item><term id="abb4">JSON</term><def><p>JavaScript Object Notation</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mann</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chunara</surname><given-names>R</given-names> </name><name name-style="western"><surname>Testa</surname><given-names>PA</given-names> </name><name name-style="western"><surname>Nov</surname><given-names>O</given-names> </name></person-group><article-title>COVID-19 transforms health care through telemedicine: evidence from the field</article-title><source>J Am Med Inform Assoc</source><year>2020</year><month>07</month><day>1</day><volume>27</volume><issue>7</issue><fpage>1132</fpage><lpage>1135</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaa072</pub-id><pub-id pub-id-type="medline">32324855</pub-id></nlm-citation></ref><ref 
id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Valdes</surname><given-names>D</given-names> </name><name name-style="western"><surname>Alqazlan</surname><given-names>L</given-names> </name><name name-style="western"><surname>Procter</surname><given-names>R</given-names> </name><name name-style="western"><surname>Dale</surname><given-names>J</given-names> </name></person-group><article-title>Global evidence on the rapid adoption of telemedicine in primary care during the first 2 years of the COVID-19 pandemic: a scoping review protocol</article-title><source>Syst Rev</source><year>2022</year><month>06</month><day>19</day><volume>11</volume><issue>1</issue><fpage>124</fpage><pub-id pub-id-type="doi">10.1186/s13643-022-01934-3</pub-id><pub-id pub-id-type="medline">35718770</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nobles</surname><given-names>AL</given-names> </name><name name-style="western"><surname>Leas</surname><given-names>EC</given-names> </name><name name-style="western"><surname>Dredze</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ayers</surname><given-names>JW</given-names> </name></person-group><article-title>Examining peer-to-peer and patient-provider interactions on a social media community facilitating ask the doctor services</article-title><source>ICWSM</source><year>2020</year><volume>14</volume><fpage>464</fpage><lpage>475</lpage><pub-id pub-id-type="doi">10.1609/icwsm.v14i1.7315</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Reissner</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ponzio</surname><given-names>MR</given-names> 
</name><name name-style="western"><surname>Nagatani-Short</surname><given-names>L</given-names> </name><name name-style="western"><surname>Hurtado</surname><given-names>A</given-names> </name><name name-style="western"><surname>Nguyen</surname><given-names>B</given-names> </name></person-group><article-title>Medication abortion experiences before and during the COVID-19 pandemic: a content analysis of online Reddit posts [A19]</article-title><source>Obstet Gynecol</source><year>2022</year><volume>139</volume><issue>1</issue><fpage>6S</fpage><lpage>6S</lpage><pub-id pub-id-type="doi">10.1097/01.AOG.0000826412.27746.c5</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Buntinx-Krieg</surname><given-names>T</given-names> </name><name name-style="western"><surname>Caravaglio</surname><given-names>J</given-names> </name><name name-style="western"><surname>Domozych</surname><given-names>R</given-names> </name><name name-style="western"><surname>Dellavalle</surname><given-names>RP</given-names> </name></person-group><article-title>Dermatology on Reddit: elucidating trends in dermatologic communications on the world wide web</article-title><source>Dermatol Online J</source><year>2017</year><month>07</month><day>15</day><volume>23</volume><issue>7</issue><fpage>13030/qt9dr1f7x6</fpage><pub-id pub-id-type="medline">29469693</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Michael</surname><given-names>LG</given-names> </name><name name-style="western"><surname>Donohue</surname><given-names>J</given-names> </name><name name-style="western"><surname>Davis</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>D</given-names> </name><name 
name-style="western"><surname>Servant</surname><given-names>F</given-names> </name></person-group><article-title>Regexes are hard: decision-making, difficulties, and risks in programming regular expressions</article-title><conf-name>2019 34th IEEE/ACM International Conference on Automated Software Engineering (ASE)</conf-name><conf-date>Nov 11-15, 2019</conf-date><conf-loc>San Diego, CA</conf-loc><pub-id pub-id-type="doi">10.1109/ASE.2019.00047</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hassanin</surname><given-names>M</given-names> </name><name name-style="western"><surname>Moustafa</surname><given-names>N</given-names> </name></person-group><article-title>A comprehensive overview of large language models (LLMs) for cyber defences: opportunities and directions</article-title><source>arXiv</source><comment>Preprint posted online on  May 23, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2405.14487</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>X</given-names> </name><etal/></person-group><article-title>GatorTron: a large clinical language model to unlock patient information from unstructured electronic health records</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 2, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2203.03540</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Plant</surname><given-names>R</given-names> </name><etal/></person-group><article-title>You are what you write: preserving privacy in the era of large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 20, 
2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2204.09391</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Agrawal</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hegselmann</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sontag</surname><given-names>D</given-names> </name></person-group><article-title>Large language models are few-shot clinical information extractors</article-title><year>2022</year><conf-name>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Dec 7-11, 2022</conf-date><conf-loc>Abu Dhabi, United Arab Emirates</conf-loc><pub-id pub-id-type="doi">10.18653/v1/2022.emnlp-main.130</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Dunn</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Structured information extraction from complex scientific text with fine-tuned large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 10, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2212.05238</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Breton</surname><given-names>J</given-names> </name><name name-style="western"><surname>Billami</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Chevalier</surname><given-names>M</given-names> 
</name><etal/></person-group><article-title>Leveraging LLMs for legal terms extraction with limited annotated data</article-title><source>Artif Intell Law</source><year>2025</year><pub-id pub-id-type="doi">10.1007/s10506-025-09448-8</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Deng</surname><given-names>X</given-names> </name><name name-style="western"><surname>Bashlovkina</surname><given-names>V</given-names> </name><name name-style="western"><surname>Han</surname><given-names>F</given-names> </name><name name-style="western"><surname>Baumgartner</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bendersky</surname><given-names>M</given-names> </name></person-group><article-title>LLMs to the moon? Reddit market sentiment analysis with large language models</article-title><conf-name>WWW '23 Companion: Companion Proceedings of the ACM Web Conference 2023</conf-name><conf-date>Apr 30 to May 4, 2023</conf-date><conf-loc>Austin, TX</conf-loc><pub-id pub-id-type="doi">10.1145/3543873.3587605</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ding</surname><given-names>H</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>H</given-names> </name></person-group><article-title>Large language models in finance: a survey</article-title><year>2023</year><month>11</month><day>27</day><conf-name>ICAIF &#x2019;23</conf-name><conf-date>Nov 27-29, 2023</conf-date><conf-loc>Brooklyn, NY</conf-loc><fpage>374</fpage><lpage>382</lpage><pub-id 
pub-id-type="doi">10.1145/3604237.3626869</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Shu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>H</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Demeter</surname><given-names>D</given-names> </name><name name-style="western"><surname>Du</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name></person-group><article-title>LawLLM: law large language model for the US legal system</article-title><conf-name>CIKM &#x2019;24: Proceedings of the 33rd ACM International Conference on Information and Knowledge Management</conf-name><conf-date>Oct 21-25, 2024</conf-date><conf-loc>Boise, ID</conf-loc><fpage>4882</fpage><lpage>4889</lpage><pub-id pub-id-type="doi">10.1145/3627673.3680020</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><article-title>Press</article-title><source>Reddit</source><access-date>2023-01-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.redditinc.com/press">https://www.redditinc.com/press</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Baumgartner</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zannettou</surname><given-names>S</given-names> </name><name name-style="western"><surname>Keegan</surname><given-names>B</given-names> </name><name name-style="western"><surname>Squire</surname><given-names>M</given-names> </name><name name-style="western"><surname>Blackburn</surname><given-names>J</given-names> 
</name></person-group><article-title>The Pushshift Reddit dataset</article-title><source>ICWSM</source><year>2020</year><month>05</month><volume>14</volume><issue>1</issue><fpage>830</fpage><lpage>839</lpage><pub-id pub-id-type="doi">10.1609/icwsm.v14i1.7347</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cohen</surname><given-names>J</given-names> </name></person-group><article-title>A coefficient of agreement for nominal scales</article-title><source>Educ Psychol Meas</source><year>1960</year><month>04</month><volume>20</volume><issue>1</issue><fpage>37</fpage><lpage>46</lpage><pub-id pub-id-type="doi">10.1177/001316446002000104</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McHugh</surname><given-names>ML</given-names> </name></person-group><article-title>Interrater reliability: the kappa statistic</article-title><source>Biochem Med (Zagreb)</source><year>2012</year><volume>22</volume><issue>3</issue><fpage>276</fpage><lpage>282</lpage><pub-id pub-id-type="medline">23092060</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>TB</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Larochelle</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ranzato</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hadsell</surname><given-names>R</given-names> </name><name name-style="western"><surname>Balcan</surname><given-names>MF</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>H</given-names> 
</name></person-group><article-title>Language models are few-shot learners</article-title><year>2020</year><conf-name>NIPS &#x2019;20: Proceedings of the 34th International Conference on Neural Information Processing Systems</conf-name><conf-date>Dec 6-12, 2020</conf-date><conf-loc>Vancouver, BC</conf-loc><fpage>1877</fpage><lpage>1901</lpage></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ahmed</surname><given-names>T</given-names> </name><name name-style="western"><surname>Devanbu</surname><given-names>P</given-names> </name></person-group><article-title>Few-shot training LLMs for project-specific code-summarization</article-title><year>2023</year><conf-name>ASE &#x2019;22: Proceedings of the 37th IEEE/ACM International Conference on Automated Software Engineering</conf-name><conf-date>Oct 10-14, 2023</conf-date><conf-loc>Rochester, MI</conf-loc><pub-id pub-id-type="doi">10.1145/3551349.3559555</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zhao</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wallace</surname><given-names>E</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>S</given-names> </name></person-group><article-title>Calibrate before use: improving few-shot performance of language models</article-title><conf-name>Proceedings of the 38th International Conference on Machine Learning, volume 139 of PMLR</conf-name><conf-date>Jul 18-24, 2021</conf-date><conf-loc>Virtual</conf-loc></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Touvron</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Llama 2: 
open foundation and fine-tuned chat models</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 18, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2307.09288</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>Gemma Team</collab><name name-style="western"><surname>Mesnard</surname><given-names>T</given-names> </name><name name-style="western"><surname>Hardin</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Gemma: open models based on Gemini research and technology</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 13, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2403.08295</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>TB</given-names> </name><etal/></person-group><article-title>Language models are few-shot learners</article-title><source>arXiv</source><comment>Preprint posted online on  May 28, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2005.14165</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>OpenAI</collab><name name-style="western"><surname>Achiam</surname><given-names>J</given-names> </name><name name-style="western"><surname>Adler</surname><given-names>S</given-names> </name><etal/></person-group><article-title>GPT-4 technical report</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 15, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="web"><article-title>Percentage of US adults who use Reddit as of September 2023, by age 
group</article-title><source>MarketingCharts</source><year>2024</year><access-date>2024-04-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.statista.com/statistics/261766/share-of-us-internet-users-who-use-reddit-by-age-group">https://www.statista.com/statistics/261766/share-of-us-internet-users-who-use-reddit-by-age-group</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kindred</surname><given-names>R</given-names> </name><name name-style="western"><surname>Bates</surname><given-names>GW</given-names> </name></person-group><article-title>The influence of the COVID-19 pandemic on social anxiety: a systematic review</article-title><source>Int J Environ Res Public Health</source><year>2023</year><month>01</month><day>29</day><volume>20</volume><issue>3</issue><fpage>2362</fpage><pub-id pub-id-type="doi">10.3390/ijerph20032362</pub-id><pub-id pub-id-type="medline">36767728</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chae</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Davidson</surname><given-names>T</given-names> </name></person-group><article-title>Large language models for text classification: from zero-shot learning to fine-tuning</article-title><source>Sociol Methods Res</source><year>2023</year><pub-id pub-id-type="doi">10.1177/00491241251325243</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Duan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>K</given-names> 
</name><name name-style="western"><surname>Cai</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name></person-group><article-title>A survey on large language model (LLM) security and privacy: the good, the bad, and the ugly</article-title><source>High Confidence Comput</source><year>2024</year><month>06</month><volume>4</volume><issue>2</issue><fpage>100211</fpage><pub-id pub-id-type="doi">10.1016/j.hcc.2024.100211</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="web"><article-title>OpenAI API documentation</article-title><source>OpenAI</source><access-date>2024-10-04</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://platform.openai.com/docs/api-reference/introduction">https://platform.openai.com/docs/api-reference/introduction</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Labeling guidelines.</p><media xlink:href="jmir_v27i1e74094_app1.docx" xlink:title="DOCX File, 23 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Example prompt for a large language model.</p><media xlink:href="jmir_v27i1e74094_app2.docx" xlink:title="DOCX File, 22 KB"/></supplementary-material></app-group></back></article>