<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e92413</article-id><article-id pub-id-type="doi">10.2196/92413</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Extracting Medical Information From Unstructured Clinical Text Using Large Language Models to Enhance Health Care Interoperability: Proof-of-Concept Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Ery&#x0131;lmaz</surname><given-names>Bahad&#x0131;r</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Arzideh</surname><given-names>Kamyar</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Bahn</surname><given-names>Mikel</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Damm</surname><given-names>Hendrik</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Warmer</surname><given-names>Sina</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Sch&#x00E4;fer</surname><given-names>Henning</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Idrissi-Yaghir</surname><given-names>Ahmad</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Pakull</surname><given-names>Tabea M G</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Albrecht</surname><given-names>Lea Jessica</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kleesiek</surname><given-names>Jens</given-names></name><degrees>Prof Dr</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lodde</surname><given-names>Georg</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Friedrich</surname><given-names>Christoph M</given-names></name><degrees>Prof Dr</degrees><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Livingstone</surname><given-names>Elisabeth</given-names></name><degrees>Prof Dr</degrees><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Schadendorf</surname><given-names>Dirk</given-names></name><degrees>Prof Dr</degrees><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Borys</surname><given-names>Katarzyna</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Nensa</surname><given-names>Felix</given-names></name><degrees>Prof Dr</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Hosch</surname><given-names>Ren&#x00E9;</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>University Hospital Essen, Institute for Artificial Intelligence in Medicine (IKIM)</institution><addr-line>Girardetstra&#x00DF;e 2</addr-line><addr-line>Essen</addr-line><addr-line>NRW</addr-line><country>Germany</country></aff><aff id="aff2"><institution>University Hospital Essen, Institute of Diagnostic and Interventional Radiology and Neuroradiology</institution><addr-line>Essen</addr-line><addr-line>NRW</addr-line><country>Germany</country></aff><aff 
id="aff3"><institution>Central IT Department</institution><addr-line>Essen</addr-line><addr-line>NRW</addr-line><country>Germany</country></aff><aff id="aff4"><institution>Department of Computer Science, University of Applied Sciences and Arts Dortmund</institution><addr-line>Dortmund</addr-line><addr-line>NRW</addr-line><country>Germany</country></aff><aff id="aff5"><institution>University Hospital Essen, Institute of Medical Informatics, Biometry and Epidemiology</institution><addr-line>Essen</addr-line><addr-line>NRW</addr-line><country>Germany</country></aff><aff id="aff6"><institution>University Hospital Essen, Institute for Transfusion Medicine</institution><addr-line>Essen</addr-line><addr-line>NRW</addr-line><country>Germany</country></aff><aff id="aff7"><institution>Department of Dermatology, University Hospital Essen</institution><addr-line>Essen</addr-line><country>Germany</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Arora</surname><given-names>Akshay</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Palama</surname><given-names>Valentina</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Liu</surname><given-names>Zhao</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Bahad&#x0131;r Ery&#x0131;lmaz, MSc, University Hospital Essen, Institute for Artificial Intelligence in Medicine (IKIM), Girardetstra&#x00DF;e 2, Essen, NRW, 45131, Germany; <email>bahadir.eryilmaz@uk-essen.de</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date 
pub-type="epub"><day>2</day><month>7</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e92413</elocation-id><history><date date-type="received"><day>29</day><month>01</month><year>2026</year></date><date date-type="rev-recd"><day>15</day><month>05</month><year>2026</year></date><date date-type="accepted"><day>27</day><month>05</month><year>2026</year></date></history><copyright-statement>&#x00A9; Bahad&#x0131;r Ery&#x0131;lmaz, Kamyar Arzideh, Mikel Bahn, Hendrik Damm, Sina Warmer, Henning Sch&#x00E4;fer, Ahmad Idrissi-Yaghir, Tabea M G Pakull, Lea Jessica Albrecht, Jens Kleesiek, Georg Lodde, Christoph M Friedrich, Elisabeth Livingstone, Dirk Schadendorf, Katarzyna Borys, Felix Nensa, Ren&#x00E9; Hosch. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 2.7.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e92413"/><abstract><sec><title>Background</title><p>Unstructured clinical text remains a major barrier to interoperable data reuse and large-scale secondary analysis in health care. Large language models (LLMs) have the potential to automate the extraction of structured clinical information; however, their application is limited by the scarcity of high-quality annotated training data.</p></sec><sec><title>Objective</title><p>To address these limitations, this study aims to develop and validate a scalable, privacy-preserving framework that uses synthetic data generated from structured Fast Healthcare Interoperability Resources (FHIR) to fine-tune open-source LLMs for the effective extraction of interoperable clinical information from unstructured text.</p></sec><sec sec-type="methods"><title>Methods</title><p>We evaluated an LLM-based framework for extracting structured clinical information from cancer-related discharge letters and mapping it to representations compatible with FHIR. To enable large-scale supervised training, we developed a random sample generator that creates synthetic discharge letters using Qwen3-235B by randomly sampling and aggregating structured FHIR data from 41,175 patients with cancer. The resulting synthetic discharge letters (n=75,000) were paired with their originating structured data, forming a large-scale dataset for fine-tuning MedGemma 27B, a 27-billion-parameter medical language model. 
Evaluation was conducted on the synthetic test dataset (n=7500) and on real-world discharge letters (n=30), which were evaluated by physicians and a medical student, and included a comparison with a one-shot approach using open-source models (Qwen3, LLaMA, and GPT-OSS).</p></sec><sec sec-type="results"><title>Results</title><p>The fine-tuned model achieved high extraction performance across multiple clinical entities on the synthetic test set, with <italic>F</italic><sub>1</sub>-scores of 0.84 for full <italic>International Classification of Diseases</italic> diagnosis codes, 0.99 for tumor-related information, 0.99 for laboratory values, 0.99 for medication names and dosages, and 0.94 for Anatomical Therapeutic Chemical medication codes. The extraction of procedure-related information was more challenging, with <italic>F</italic><sub>1</sub>-scores of 0.63 for Operation and Procedure Classification (OPS) codes and 0.90 for procedure descriptions. The fine-tuned model consistently outperformed general-purpose LLMs in a one-shot comparison across nearly all extraction categories. 
When evaluated by physicians on real-world discharge letters, the model achieved case-level correctness rates of 78.9% for <italic>International Classification of Diseases</italic> diagnoses, 86.1% for tumor-related information, 93.0% for medications, and 61.3% for procedures.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>These results demonstrate that synthetic text generation from structured clinical data enables the effective and scalable training of LLMs for extracting interoperable, multientity clinical information from unstructured documentation.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>artificial intelligence</kwd><kwd>AI in health care</kwd><kwd>interoperability</kwd><kwd>Fast Healthcare Interoperability Resources</kwd><kwd>FHIR</kwd><kwd>entity extraction</kwd><kwd>generative AI</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Electronic health records (EHRs) are popular in modern health care, serving as digital repositories of patient events, diagnoses, procedures, and observations. They are relevant not only for clinical care but also for health care operations, quality management, and medical research [<xref ref-type="bibr" rid="ref1">1</xref>]. However, processing EHR data remains a complex challenge [<xref ref-type="bibr" rid="ref2">2</xref>], particularly the unstructured portion, which constitutes approximately 80% of all EHR data [<xref ref-type="bibr" rid="ref3">3</xref>]. Clinical narratives, such as discharge letters and progress notes, are rich in contextual detail but are difficult to reuse due to a lack of standardization, high linguistic diversity, and strict privacy considerations [<xref ref-type="bibr" rid="ref4">4</xref>]. These issues limit the ability to extract and leverage valuable information from free-text clinical documentation efficiently. 
In Germany, hospitals generate vast volumes of unstructured EHR data each year [<xref ref-type="bibr" rid="ref5">5</xref>]. For example, based on a query of the hospital information system conducted in 2025, the University Hospital Essen produces approximately 140,000 discharge letters and 2.9 million progress notes annually. Unstructured clinical text often contains richer and more nuanced information than structured data, underscoring its potential value for secondary use [<xref ref-type="bibr" rid="ref6">6</xref>]. Consequently, transforming unstructured data into structured, machine-readable formats has become a primary focus of medical informatics [<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>To achieve this structural transformation effectively, the domain has increasingly converged on the Health Level Seven Fast Healthcare Interoperability Resources (FHIR) standard [<xref ref-type="bibr" rid="ref8">8</xref>]. Unlike legacy formats, FHIR uses a modern, web-based approach to represent clinical data as granular, independent <italic>resources</italic>, such as <italic>Conditions</italic>, <italic>Procedures</italic>, or <italic>Observations</italic>, enabling seamless data exchange and standardized representation. Standards such as FHIR are pivotal for achieving semantic interoperability, ensuring that medical information extracted from isolated notes is universally understood by downstream applications and research platforms. This development is further reinforced by large-scale initiatives such as the German Medizininformatik-Initiative [<xref ref-type="bibr" rid="ref9">9</xref>] and the emerging European Health Data Space [<xref ref-type="bibr" rid="ref10">10</xref>], both of which position FHIR as a central interoperability standard for cross-institutional data sharing and secondary use of health data. 
However, while FHIR provides the necessary framework for interoperable health care data, automatically extracting relevant information from clinical documents remains an open issue. Early work by Li et al [<xref ref-type="bibr" rid="ref11">11</xref>] introduced FHIR-GPT, a framework leveraging large language models (LLMs) to convert unstructured clinical narratives into standardized FHIR resources, demonstrating that LLM-based approaches can enhance health data interoperability without relying on complex, multistep natural language processing (NLP) pipelines.</p><p>At the same time, recent advancements in NLP, particularly the rise of LLMs, have introduced promising new capabilities for processing unstructured medical text [<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref18">18</xref>]. A generative approach by Majid et al [<xref ref-type="bibr" rid="ref19">19</xref>] compared encoder-only architectures against generative LLMs in a cohort of ophthalmology patients, demonstrating that modern generative models can achieve superior performance in extracting named entities from unstructured medical reports. Recent initiatives [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>] have focused on integrating open-source LLMs into German clinical workflows to enhance information extraction. Despite these promising advancements, the field continues to grapple with significant challenges: the scarcity of training data and the prohibitive time and effort required for the manual evaluation of unstructured clinical text. To address these bottlenecks, synthetic data generation has emerged as a crucial strategy, increasingly recognized as a powerful solution to overcome data scarcity in clinical NLP [<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref24">24</xref>].</p><p>In this work, we introduce PIGEON (Patient Information Generation from Organized Notes). 
Rather than presenting merely another fine-tuned extraction model, PIGEON establishes a scalable paradigm for generating synthetic supervised training data directly from interoperable clinical backends (FHIR). This approach enables the robust fine-tuning of open-source LLMs under strict data governance constraints. We demonstrate this framework by fine-tuning MedGemma 27B using data from a production FHIR R4 server containing over 2 billion resources. We have validated PIGEON&#x2019;s efficacy through synthetic dataset benchmarking and assessed its real-world performance via clinician review of predictions on real-world discharge letters.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>This study was approved by the Ethics Committee of the University Hospital Essen (approval number 24&#x2010;12111-BO). Due to the study&#x2019;s retrospective nature, the requirement for written informed consent was waived by the Ethics Committee.</p></sec><sec id="s2-2"><title>Discharge Letter Generator</title><p>To generate input-output pairs from the FHIR cache, we used a synthetic discharge letter-generation process with the Qwen3-235B LLM [<xref ref-type="bibr" rid="ref25">25</xref>]. This approach used the FHIR cache to create realistic discharge letters by selecting relevant FHIR resources and formatting them into coherent document sections. The goal was to simulate the unstructured nature found in authentic discharge letters while maintaining control over the content and structure. The complete workflow architecture is illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Schematic overview of the PIGEON (Patient Information Generation from Organized Notes) workflow for extracting clinical data. 
The process begins with querying patient cohort statistics from the Fast Healthcare Interoperability Resources (FHIR) repository to populate a local FHIR cache. The &#x201C;Random Sample Generator&#x201D; then creates a synthetic training dataset by pairing structured patient data (transformed into an intermediate JSON format/Labels) with synthetic clinical narratives (&#x201C;discharge letters&#x201D;) generated via Qwen3 and recipes. The resulting PIGEON dataset is split (0.9 ratio) to fine-tune the MedGemma 27B model. The framework includes a validation feedback loop to refine generation based on test scores, concluding with a final performance evaluation against human-annotated real-world data. SQL: Structured Query Language.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e92413_fig01.png"/></fig><p>Following this, a synthetic dataset, hereafter referred to as the PIGEON dataset, is generated from this cache using the random sample generator. The described workflow followed an iterative approach, allowing the refinement of the data selection rules and optimization of the instructional prompts. This generator uses a scalable, class-based random selection of FHIR resources to produce synthetic discharge letters that closely resemble authentic clinical correspondence. This random selection happens through rule-based templates (recipes), which tell the generator exactly which data to use for generation. Crucially, to preclude any risk of memorization or structural leakage from real patient records, no actual discharge letters were used as prompts within this framework. Instead, all narrative variability is strictly derived either through rule-based recipes or via LLM generation conditioned exclusively on the structured FHIR data. In total, 149,000 recipes were derived from the available FHIR cache. 
A detailed illustration of this process and the randomization steps is provided in <xref ref-type="fig" rid="figure2">Figure 2</xref>.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Detailed schema of the synthetic text generation pipeline. The workflow transforms structured input data (left column) into coherent synthetic discharge letters (right column) through a randomized logic layer (middle column). For structured sections such as Introduction, Diagnosis, Medication, and Laboratory, the system uses the &#x201C;Random Sample Generator&#x201D; to select from predefined templates and toggle specific parameters, such as the inclusion of dates or addresses. Conversely, narrative-heavy sections like the Brief Hospital Course (BHC) and Medical History are generated via large language model (LLM) prompting to introduce linguistic variability and simulate realistic free-text reporting.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e92413_fig02.png"/></fig><p>To ensure data consistency across input-output pairs, multiple recipes are generated per patient. The generator accepts a recipe as input, retrieves the corresponding data, and constructs specific sections using a set of structural and content rules and the Qwen3-235B vLLM end point. Ultimately, the generator produces both the associated labels and the final synthetic discharge letter.</p></sec><sec id="s2-3"><title>Synthetic Discharge Letter Structure</title><p>The synthetic letters were generated by reading the recipe information and the selected FHIR resources from the FHIR cache and mapping them to predefined sections. 
Each section was populated using relevant resource types, as outlined in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Semantic mapping of structured FHIR<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> resources to defined sections of the synthetic discharge letters.<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Letter section</td><td align="left" valign="bottom">Relevant FHIR resource types</td><td align="left" valign="bottom">Type of medical data</td></tr></thead><tbody><tr><td align="left" valign="top">Greeting</td><td align="left" valign="top">Patient, Encounter</td><td align="left" valign="top">Patient information and stay duration</td></tr><tr><td align="left" valign="top">Main Diagnosis</td><td align="left" valign="top">Condition</td><td align="left" valign="top">Diagnosis and date</td></tr><tr><td align="left" valign="top">Side Diagnosis</td><td align="left" valign="top">Condition</td><td align="left" valign="top">Diagnosis and date</td></tr><tr><td align="left" valign="top">Tumor Data</td><td align="left" valign="top">Condition, Procedure, Observation</td><td align="left" valign="top">Comprehensive tumor documentation</td></tr><tr><td align="left" valign="top">Anamneses</td><td align="left" valign="top">Condition, Procedure, Observation, Medications</td><td align="left" valign="top">Diagnosis, procedures, lab values, vital signs and body information, medications</td></tr><tr><td align="left" valign="top">Clinical Course</td><td align="left" valign="top">Condition, Procedure, Observation, Medications</td><td align="left" valign="top">Diagnosis, procedures, lab values, medications</td></tr><tr><td align="left" valign="top">Lab</td><td align="left" valign="top">Observation</td><td align="left" valign="top">Lab values</td></tr><tr><td align="left" 
valign="top">Medications</td><td align="left" valign="top">Medication, MedicationStatement</td><td align="left" valign="top">Medications, medication dosages</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>FHIR: Fast Healthcare Interoperability Resources.</p></fn><fn id="table1fn2"><p><sup>b</sup>The table shows the specific resource types used to populate each narrative component, ensuring that clinical data elements (eg, diagnoses, medications, procedures) are accurately represented in the generated documentation context.</p></fn></table-wrap-foot></table-wrap><p>To introduce variability and enhance sample diversity, multiple randomization strategies were applied during resource selection and letter composition including generating multiple variants of diagnoses, several templates for non-LLM sections, previously explained recipe logic, and dynamic prompting of LLM-generated sections. Additional randomization was applied to dates, addresses, document metadata, and overall styling. While most narrative content such as clinical course and medical history is generated using the LLM, structured tables for lab results and medications are generated deterministically and appended to the letter. Based on an analysis of authentic documents, we identified 5 standard lab value section formats and 6 types of medication sections. These formats were hard-coded and selected randomly during generation to match observed documentation styles (Supplement D in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). An example of an anonymized and simplified synthetic discharge letter is shown in <xref ref-type="fig" rid="figure3">Figure 3</xref> along with the intermediate JSON format.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Anonymized and translated discharge letter with output from the model side-by-side. Extracted entities are color-coded. 
For demonstration purposes, the letter and JSON are cut, and other entities extracted by the model are not shown. These excluded entities include medications, lab values, and diagnoses.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e92413_fig03.png"/></fig><p>This JSON format is designed to be populated based on the number of entities and hierarchical levels present in the discharge letter. With the exception of the introduction and tumor information sections, the remaining fields can accommodate multiple dictionaries, each of which has the potential to be postprocessed into an FHIR resource. In the figure, the medical entities to be extracted are highlighted in their respective hierarchy in the JSON.</p></sec><sec id="s2-4"><title>Model Training and Evaluation</title><p>The model was fine-tuned on the PIGEON dataset, which was split at the patient level into 90% training and 10% test sets. We used instruction-based fine-tuning, using prompts designed to extract key relevant medical entities and map them directly to their corresponding hierarchy in the intermediate JSON format. Training was conducted on a single NVIDIA A100 GPU, leveraging the Unsloth library [<xref ref-type="bibr" rid="ref26">26</xref>] for memory-efficient fine-tuning. Additionally, mixed-precision training with 16-bit floating-point formats (FP16/bfloat16) [<xref ref-type="bibr" rid="ref27">27</xref>] and gradient accumulation were implemented to optimize computational throughput and memory usage. Detailed hyperparameters are presented in Supplement C in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>Furthermore, the model was evaluated for its capability to produce the correct intermediate JSON schema. The JSON output was flattened to compare the labels against the model extractions. 
Performance was quantified using the <italic>F</italic><sub>1</sub>-score, the harmonic mean of precision and recall, which balances the trade-off between completeness and correctness of extraction. All medical code predictions (<italic>International Classification of Diseases</italic> [<italic>ICD</italic>], Anatomical Therapeutic Chemical [ATC], Operation and Procedure Classification [OPS]) were evaluated using Jaccard similarity, as these fields represent unordered sets of variable length where set-level agreement is more informative than positional matching [<xref ref-type="bibr" rid="ref28">28</xref>], while remaining extractions with single expected values were evaluated through direct match, where exact correspondence is required. The model was evaluated on 2 test datasets: a synthetic dataset and a real-world discharge letter dataset, for which predictions were reviewed by clinicians. The synthetic test set was used to assess the performance of the fine-tuned models on a large scale, whereas the real-world dataset evaluated the model&#x2019;s capability on actual clinical discharge letters.</p><p>For both evaluation sets, we compared the model with generalist models using one-shot prompting. To our knowledge, no publicly available fine-tuned open-source model currently exists for multientity clinical information extraction from German discharge letters into FHIR-compatible structured formats, making a comparison with an adapted task-specific baseline infeasible at the time of this study. A few-shot approach was also not viable due to context window constraints: German discharge letters can be lengthy documents, and the target JSON schema is itself large. The one-shot prompts were iteratively optimized to maximize performance within these constraints (Supplement A in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><p>To perform the human evaluation, a custom React-based web app was developed. 
The evaluation team included 3 experienced dermatology clinicians from University Hospital Essen and 1 medical student with 3 years of experience in medical text annotation. This group assessed outputs exclusively from the PIGEON and Qwen3 (235B) models, using a set of 30 discharge letters from various clinical departments. Scoring followed a case-level approach: each extracted clinical entity was treated as a composite case consisting of its constituent fields (eg, a diagnosis comprises the diagnosis name, the associated <italic>ICD</italic> code, and the date; a medication comprises the medication name, dosage, and ATC code; a laboratory value comprises the parameter name and its value). A case was marked as correct only if all constituent fields were accurately extracted; if any single field was incorrect or missing, the entire case was scored as wrong. This strict, all-or-nothing scoring ensures that the reported accuracy reflects clinically meaningful correctness rather than partial matches. For each instance, annotators verified the model&#x2019;s response and manually provided the correct answer if the model was wrong. To assess interannotator reliability, 6 discharge letters were independently annotated by 2 dermatology annotators, yielding 187 comparable annotation pairs across all clinical domains. Given the high workload required for this detailed review, this study was designed as a proof-of-concept study that prioritizes the depth and clinical accuracy of the evaluation over a large-scale dataset.</p></sec><sec id="s2-5"><title>Code Prediction Enhancement</title><p>Additionally, a postprocessing module based on retrieval-augmented generation (RAG) [<xref ref-type="bibr" rid="ref29">29</xref>] was implemented and evaluated to enhance <italic>International Classification of Diseases, 10th Revision</italic> (<italic>ICD-10</italic>), ATC, and OPS code prediction. 
Each extracted entity display was postprocessed with a retrieval-augmented corrector to suppress code hallucinations. This enhancement system embeds the entire vocabulary of <italic>ICD</italic>, ATC, and OPS codes, sourced by merging official BfArM catalogs with institution-specific code-description mappings from our FHIR repository, using a multilingual sentence-transformer (paraphrase-multilingual-MiniLM-L12-v2) into a Facebook AI Similarity Search [<xref ref-type="bibr" rid="ref30">30</xref>] index; retrieves the top-10 semantically closest code candidates for the entity name via cosine similarity; supplements them with authoritative code descriptions fetched from the official website (only for <italic>ICD</italic> correction) [<xref ref-type="bibr" rid="ref31">31</xref>]; and then prompts a deterministic MedGemma 4B vLLM end point to pick the single most plausible code. The model is instructed to stay within the retrieved candidate set or the lookup vocabulary, and the chosen code replaces the raw generation in the final output JSON. When retrieval confidence is low (top-1 cosine similarity to the nearest code candidate below 0.9), the system falls back to the original model prediction to avoid introducing additional errors. All prompts used in the training, inference, and code prediction are provided in Supplement A in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Cohort Characteristics</title><p>The PIGEON dataset comprised 41,175 patients (n=18,982, 46.1% female), with malignant neoplasms of the bronchi or lungs being the most prevalent diagnosis (<italic>ICD-10</italic> code C34, n=4076, 9.9%). Among those patients, encounter numbers varied across the cohort with a median number of encounters per patient of 27 (IQR 11&#x2010;56). 
A substantial proportion of patients (n=6876, 16.7%) had between 31 and 50 encounters. Given the chronic and complex nature of their conditions, primarily cancer, these patients typically have a protracted clinical journey spanning multiple years. This longitudinal course necessitates frequent clinical visits, each generating substantial volumes of FHIR resources, which are visualized in <xref ref-type="fig" rid="figure4">Figure 4</xref>. A comprehensive summary of the cohort&#x2019;s baseline characteristics, including age distribution, diagnostic counts, and quantification of FHIR resources per patient, is provided in <xref ref-type="table" rid="table2">Table 2</xref>.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Overview of patient cohort demographics and clinical characteristics. (A) Top: distribution of health care use, measured by the range of medical encounters per patient. (B) Bottom left: the top 20 cancer sites within the cohort, ranked by the frequency of <italic>International Classification of Diseases, 10th Revision</italic> (<italic>ICD-10</italic>) code classifications. 
(C) Bottom right: population pyramid showing the distribution of patients by age and gender.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e92413_fig04.png"/></fig><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Demographic and clinical characteristics of the patient cohort<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> (N=41,175).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Category</td><td align="left" valign="bottom">Value</td></tr></thead><tbody><tr><td align="left" valign="top">Female, n (%)</td><td align="left" valign="top">18,752 (46.1)</td></tr><tr><td align="left" valign="top">Male, n (%)</td><td align="left" valign="top">21,943 (53.9)</td></tr><tr><td align="left" valign="top">Age at last encounter in years, mean (SD)</td><td align="left" valign="top">66.3 (16.8)</td></tr><tr><td align="left" valign="top">Encounters per patient, median (IQR)</td><td align="left" valign="top">27.0 (11.0&#x2010;56.0)</td></tr><tr><td align="left" valign="top">FHIR<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup> resources per patient, median (IQR)</td><td align="left" valign="top">656 (190&#x2010;1941)</td></tr><tr><td align="left" valign="top">Main diagnoses per patient, median (IQR)</td><td align="left" valign="top">2 (1-3)</td></tr><tr><td align="left" valign="top">Patients with &#x003E;1 main diagnosis, n (%)</td><td align="left" valign="top">34,068 (82.7)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>The table summarizes the baseline data for the study cohort, detailing patient demographics including age and gender distribution. It further provides quantitative metrics for clinical data volume. 
</p></fn><fn id="table2fn2"><p><sup>b</sup>FHIR: Fast Healthcare Interoperability Resources.</p></fn></table-wrap-foot></table-wrap><p>To ensure comprehensive coverage of clinical scenarios and maximize the dataset&#x2019;s generalizability, we analyzed the distribution of the main diagnoses. <xref ref-type="fig" rid="figure4">Figure 4B</xref> illustrates the localization of cancers among the patients. This distribution is based solely on the data available on the servers of the investigating site and is naturally influenced by regional cancer epidemiology. As depicted in <xref ref-type="fig" rid="figure4">Figure 4B</xref>, the dataset incorporates a broad spectrum of cancer cases. Furthermore, the distribution of encounters per patient, presented in <xref ref-type="fig" rid="figure4">Figure 4A</xref>, illustrates the variability in health care utilization and data density across the cohort. This factor influences the number of primary diagnoses per patient and helps in understanding the extent of concurrent clinical journeys a single patient has in the data repository.</p><p>To digitally represent these journeys, we used specific FHIR resources that serve as the foundational data units, encapsulating discrete medical information such as diagnoses, vital signs, medications, diagnostic reports, tumor information, and procedure details. Each patient journey in the study cohort included Patient, Encounter, Condition, Procedure, Observation, and MedicationStatement resources; detailed descriptions are provided in Supplement G in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>The collected FHIR data comprised 57 million resources, including 47 million Observations, 3.6 million Conditions, 2.1 million Procedures, and 646,000 MedicationStatements. It includes 10,000 unique <italic>ICD</italic> codes and 11,000 German OPS codes. 
This extensive and diverse data corpus provides a robust representation of a wide range of clinical scenarios, and the resulting dataset is designated as the FHIR cache.</p></sec><sec id="s3-2"><title>Synthetic Discharge Letter Evaluation</title><p><xref ref-type="table" rid="table3">Table 3</xref> reports PIGEON&#x2019;s performance against 3 general-purpose baselines across all evaluation categories. The largest performance gaps appear in the medical-code extraction tasks (full <italic>ICD</italic>, OPS, ATC), where domain-specific knowledge is most critical.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Benchmarking of the PIGEON<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> model against state-of-the-art large language models (LLMs) across stratified clinical data domains<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup>.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="3">Evaluation category and metric</td><td align="left" valign="bottom">PIGEON</td><td align="left" valign="bottom">Qwen</td><td align="left" valign="bottom">LLaMA</td><td align="left" valign="bottom">OSS</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="7">Schema validity</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Valid JSON schema (%)</td><td align="left" valign="top">99.61</td><td align="left" valign="top">99.94</td><td align="left" valign="top">99.71</td><td align="left" valign="top">87.25</td></tr><tr><td align="left" valign="top" colspan="7">Diagnosis fields<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><italic>ICD</italic><sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup> chapter</td><td 
align="left" valign="top">0.9557</td><td align="left" valign="top">0.7789</td><td align="left" valign="top">0.8078</td><td align="left" valign="top">0.7096</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><italic>ICD</italic> category</td><td align="left" valign="top">0.8795</td><td align="left" valign="top">0.5969</td><td align="left" valign="top">0.5962</td><td align="left" valign="top">0.5464</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><italic>ICD</italic> (full code)</td><td align="left" valign="top">0.8395</td><td align="left" valign="top">0.5003</td><td align="left" valign="top">0.4784</td><td align="left" valign="top">0.4517</td></tr><tr><td align="left" valign="top" colspan="7">Tumor information field</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Average tumor-related score</td><td align="left" valign="top">0.9912</td><td align="left" valign="top">0.8752</td><td align="left" valign="top">0.8494</td><td align="left" valign="top">0.7287</td></tr><tr><td align="left" valign="top" colspan="7">Free-text fields<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Labs</td><td align="left" valign="top">0.9938</td><td align="left" valign="top">0.6750</td><td align="left" valign="top">0.6178</td><td align="left" valign="top">0.1429</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Medications</td><td align="left" valign="top">0.9776</td><td align="left" valign="top">0.8859</td><td align="left" valign="top">0.8566</td><td align="left" 
valign="top">0.7776</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Procedure codes</td><td align="left" valign="top">0.6345</td><td align="left" valign="top">0.1709</td><td align="left" valign="top">0.3550</td><td align="left" valign="top">0.0237</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Procedures</td><td align="left" valign="top">0.8972</td><td align="left" valign="top">0.5158</td><td align="left" valign="top">0.5602</td><td align="left" valign="top">0.2954</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Side diagnosis <italic>ICD</italic> chapter</td><td align="left" valign="top">0.8961</td><td align="left" valign="top">0.5641</td><td align="left" valign="top">0.4873</td><td align="left" valign="top">0.1482</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Side diagnosis <italic>ICD</italic> category</td><td align="left" valign="top">0.8026</td><td align="left" valign="top">0.3563</td><td align="left" valign="top">0.3385</td><td align="left" valign="top">0.1240</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Side diagnosis <italic>ICD</italic> (full code)</td><td align="left" valign="top">0.7516</td><td align="left" valign="top">0.2838</td><td align="left" valign="top">0.2685</td><td align="left" valign="top">0.1143</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Body values</td><td align="left" valign="top">0.9938</td><td align="left" valign="top">0.8464</td><td align="left" valign="top">0.8460</td><td align="left" 
valign="top">0.8687</td></tr><tr><td align="left" valign="top" colspan="7">Other data fields<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Introduction</td><td align="left" valign="top">0.9534</td><td align="left" valign="top">0.7261</td><td align="left" valign="top">0.7853</td><td align="left" valign="top">0.7897</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Lab values</td><td align="left" valign="top">0.9990</td><td align="left" valign="top">0.9827</td><td align="left" valign="top">0.9583</td><td align="left" valign="top">0.8504</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Medication ATC<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup> codes</td><td align="left" valign="top">0.9368</td><td align="left" valign="top">0.8292</td><td align="left" valign="top">0.7554</td><td align="left" valign="top">0.7495</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Medication dosages</td><td align="left" valign="top">0.9987</td><td align="left" valign="top">0.9528</td><td align="left" valign="top">0.8877</td><td align="left" valign="top">0.8950</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Medication names</td><td align="left" valign="top">0.9997</td><td align="left" valign="top">0.9373</td><td align="left" valign="top">0.8342</td><td align="left" valign="top">0.8580</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>PIGEON: Patient Information Generation from Organized Notes.</p></fn><fn id="table3fn2"><p><sup>b</sup>The 
PIGEON model was evaluated against 3 leading open-weights models: Qwen3 (235B), LLaMA 3.3 (70B), and GPT-OSS (120B). The results reflect raw model outputs; no postprocessing or external schema validation was applied. The scores represent the mean performance across 10 iterations of the test dataset. For all models, each inference run required approximately 1 hour to complete.</p></fn><fn id="table3fn3"><p><sup>c</sup>Performance is reported via <italic>F</italic><sub>1</sub>-scores. </p></fn><fn id="table3fn4"><p><sup>d</sup>ICD: <italic>International Classification of Diseases</italic>.</p></fn><fn id="table3fn5"><p><sup>e</sup>ATC: Anatomical Therapeutic Chemical.</p></fn></table-wrap-foot></table-wrap><p>The evaluation assesses PIGEON&#x2019;s performance across various categories, including the particularly challenging domain of medical code extraction, where PIGEON substantially outperforms all baselines. As shown in <xref ref-type="table" rid="table3">Table 3</xref>, generalist baselines experience drastic performance drops in these rigorous tasks; for instance, Qwen3 achieves only an <italic>F</italic><sub>1</sub>-score of 0.171 on procedure codes. In contrast, PIGEON maintains a strong macroaveraged <italic>F</italic><sub>1</sub>-score of 0.912 across all 17 evaluated metrics. Even on the granular full-code <italic>ICD</italic> task, it achieves an <italic>F</italic><sub>1</sub>-score of 0.840 (compared to Qwen3&#x2019;s 0.500) and outperforms baselines on procedure codes with an <italic>F</italic><sub>1</sub>-score of 0.635. 
Representative error examples illustrating the dominant failure modes of the baseline models (empty <italic>ICD</italic> and OPS codes, incorrect subdigits) are provided in Supplement E in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>We present a quantitative evaluation in <xref ref-type="table" rid="table4">Table 4</xref> demonstrating how the RAG module improves code correction and overall prediction accuracy.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Quantitative impact of the RAG<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup> postprocessing module on medical entity prediction.<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Category</td><td align="left" valign="bottom">Before RAG</td><td align="left" valign="bottom">After RAG</td><td align="left" valign="bottom">Improvement</td></tr></thead><tbody><tr><td align="left" valign="top"><italic>ICD</italic><sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup> chapter</td><td align="left" valign="top">0.9579</td><td align="left" valign="top">0.9540</td><td align="left" valign="top">&#x2013;0.0039</td></tr><tr><td align="left" valign="top"><italic>ICD</italic> category</td><td align="left" valign="top">0.8795</td><td align="left" valign="top">0.9009</td><td align="left" valign="top">+0.0214</td></tr><tr><td align="left" valign="top"><italic>ICD</italic> (full code)</td><td align="left" valign="top">0.7314</td><td align="left" valign="top">0.8666</td><td align="left" valign="top">+0.1352</td></tr><tr><td align="left" valign="top">Procedure codes OPS<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td><td align="left" valign="top">0.6345</td><td align="left" valign="top">0.7295</td><td align="left" valign="top">+0.095</td></tr><tr><td align="left" valign="top">Medication ATC<sup><xref 
ref-type="table-fn" rid="table4fn5">e</xref></sup> codes</td><td align="left" valign="top">0.8523</td><td align="left" valign="top">0.9178</td><td align="left" valign="top">+0.0655</td></tr><tr><td align="left" valign="top">Free-text side diagnosis <italic>ICD</italic> chapter</td><td align="left" valign="top">0.9077</td><td align="left" valign="top">0.9114</td><td align="left" valign="top">+0.0037</td></tr><tr><td align="left" valign="top">Free-text side diagnosis <italic>ICD</italic> category</td><td align="left" valign="top">0.8276</td><td align="left" valign="top">0.8400</td><td align="left" valign="top">+0.0124</td></tr><tr><td align="left" valign="top">Free-text side diagnosis <italic>ICD</italic> (full code)</td><td align="left" valign="top">0.6621</td><td align="left" valign="top">0.7972</td><td align="left" valign="top">+0.1351</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>RAG: retrieval-augmented generation.</p></fn><fn id="table4fn2"><p><sup>b</sup>The table compares <italic>F</italic><sub>1</sub>-scores across diagnostic (<italic>International Classification of Diseases, 10th Revision</italic>), procedural (Operation and Procedure Classification), and pharmaceutical (Anatomical Therapeutic Chemical) domains before and after the application of the retrieval-based corrector.</p></fn><fn id="table4fn3"><p><sup>c</sup><italic>ICD</italic>: <italic>International Classification of Diseases</italic>.</p></fn><fn id="table4fn4"><p><sup>d</sup>OPS: Operation and Procedure Classification.</p></fn><fn id="table4fn5"><p><sup>e</sup>ATC: Anatomical Therapeutic Chemical.</p></fn></table-wrap-foot></table-wrap><p>The RAG postprocessing module demonstrates clear effectiveness in correcting granular predictions, yielding <italic>F</italic><sub>1</sub>-score improvements of roughly 13.5 percentage points for full <italic>ICD</italic> codes and 9.5 percentage points for OPS procedure codes. 
Consequently, the system achieves significantly higher scores in assigning specific medical codes.</p></sec><sec id="s3-3"><title>Human Evaluation</title><p>The human-in-the-loop evaluation was conducted on 30 real-world discharge letters from various clinical departments. The quantitative results, summarized in <xref ref-type="table" rid="table5">Table 5</xref>, demonstrate that the PIGEON model achieved superior fidelity across most fields, recording an average accuracy of 87.5% (SD 10.6%) compared to 72.2% (SD 21.6%) for the general-purpose Qwen3-235B model.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Quantitative results of the human-in-the-loop validation.<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" rowspan="2">Category</td><td align="left" valign="bottom" colspan="3">PIGEON<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup> model</td><td align="left" valign="bottom" colspan="3">Qwen3-235B</td></tr><tr><td align="left" valign="bottom">Correct (%)</td><td align="left" valign="bottom">Incorrect (%)</td><td align="left" valign="bottom">Letters, n</td><td align="left" valign="bottom">Correct (%)</td><td align="left" valign="bottom">Incorrect (%)</td><td align="left" valign="bottom">Letters, n</td></tr></thead><tbody><tr><td align="left" valign="top">Patient information</td><td align="left" valign="top">99.7</td><td align="left" valign="top">0.3</td><td align="left" valign="top">297</td><td align="left" valign="top">99.6</td><td align="left" valign="top">0.4</td><td align="left" valign="top">282</td></tr><tr><td align="left" valign="top">Main Diagnosis (Category)</td><td align="left" valign="top">78.9</td><td align="left" valign="top">21.1</td><td align="left" valign="top">237</td><td align="left" valign="top">75.8</td><td align="left" valign="top">24.2</td><td align="left" 
valign="top">194</td></tr><tr><td align="left" valign="top">Main Diagnosis (Chapter)</td><td align="left" valign="top">86.0</td><td align="left" valign="top">14.0</td><td align="left" valign="top">236</td><td align="left" valign="top">81.4</td><td align="left" valign="top">18.6</td><td align="left" valign="top">194</td></tr><tr><td align="left" valign="top">Free Text Diagnosis (Category)</td><td align="left" valign="top">80.1</td><td align="left" valign="top">19.9</td><td align="left" valign="top">156</td><td align="left" valign="top">53.3</td><td align="left" valign="top">46.7</td><td align="left" valign="top">167</td></tr><tr><td align="left" valign="top">Free Text Diagnosis (Chapter)</td><td align="left" valign="top">83.8</td><td align="left" valign="top">16.2</td><td align="left" valign="top">154</td><td align="left" valign="top">56.3</td><td align="left" valign="top">43.7</td><td align="left" valign="top">167</td></tr><tr><td align="left" valign="top">Medications</td><td align="left" valign="top">93.0</td><td align="left" valign="top">7.0</td><td align="left" valign="top">171</td><td align="left" valign="top">79.7</td><td align="left" valign="top">20.3</td><td align="left" valign="top">197</td></tr><tr><td align="left" valign="top">Free Text Medications</td><td align="left" valign="top">89.5</td><td align="left" valign="top">10.5</td><td align="left" valign="top">19</td><td align="left" valign="top">26.8</td><td align="left" valign="top">73.2</td><td align="left" valign="top">97</td></tr><tr><td align="left" valign="top">Laboratory Values</td><td align="left" valign="top">97.9</td><td align="left" valign="top">2.1</td><td align="left" valign="top">327</td><td align="left" valign="top">96.9</td><td align="left" valign="top">3.1</td><td align="left" valign="top">323</td></tr><tr><td align="left" valign="top">Free Text Laboratory Values</td><td align="left" valign="top">95.5</td><td align="left" valign="top">4.5</td><td align="left" valign="top">110</td><td 
align="left" valign="top">43.4</td><td align="left" valign="top">56.6</td><td align="left" valign="top">53</td></tr><tr><td align="left" valign="top">Procedures</td><td align="left" valign="top">61.3</td><td align="left" valign="top">38.7</td><td align="left" valign="top">137</td><td align="left" valign="top">43.5</td><td align="left" valign="top">56.5</td><td align="left" valign="top">138</td></tr><tr><td align="left" valign="top">Tumor Information</td><td align="left" valign="top">86.1</td><td align="left" valign="top">13.9</td><td align="left" valign="top">79</td><td align="left" valign="top">56.5</td><td align="left" valign="top">43.5</td><td align="left" valign="top">223</td></tr><tr><td align="left" valign="top">Average</td><td align="left" valign="top">87.5</td><td align="left" valign="top">12.5</td><td align="left" valign="top">1923</td><td align="left" valign="top">72.2</td><td align="left" valign="top">27.8</td><td align="left" valign="top">2035</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>The table compares the accuracy of the PIGEON (Patient Information Generation from Organized Notes) model and Qwen3-235B when evaluated against an expert-verified reference dataset of 30 manually annotated discharge letters. Performance is categorized by clinical domain. The predictions include postprocessing with the retrieval-augmented generation corrector module.</p></fn><fn id="table5fn2"><p><sup>b</sup>PIGEON: Patient Information Generation from Organized Notes.</p></fn></table-wrap-foot></table-wrap><p>The largest differences in <xref ref-type="table" rid="table5">Table 5</xref> between PIGEON and Qwen3 appear in the free-text categories (Free Text Diagnoses, Free Text Medications, Free Text Lab Values), while procedures remain the weakest category for both systems. 
While the Qwen3-235B model demonstrated high performance given the complexity of the task, successfully identifying correct codes with the assistance of the RAG corrector, it frequently struggled with structural coherence. The evaluation revealed that Qwen3-235B often repeated information, misallocated data within the hierarchy, or appended irrelevant details to value fields. Hence, it produced more information than the PIGEON model, especially in tumor information and free-text medications. These structural inconsistencies impacted its performance in unstructured categories, most notably in Free Text Medications and Free Text Laboratory Values, where precise semantic mapping is critical. In contrast, qualitative feedback from human annotators highlighted the PIGEON model&#x2019;s reliability and precision. The PIGEON model exhibited negligible hallucination, adopting a conservative extraction strategy where it omitted fields rather than generating plausible but incorrect data when confidence was low. A detailed hierarchy-aware error analysis of the residual procedure errors addressing the OPS-catalog ambiguity is provided in Supplement F in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>To assess the consistency of the human evaluation, interannotator agreement was calculated on the subset of 6 discharge letters that were independently reviewed by 2 experienced dermatologist annotators. Across 187 comparable annotation pairs (comprising entity-level judgments across all clinical domains), the annotators reached agreement on 164 (87.7%) cases and disagreed on 23 (12.3%) cases. 
Cohen &#x03BA; [<xref ref-type="bibr" rid="ref32">32</xref>] was 0.751, which falls within the &#x201C;substantial agreement&#x201D; range according to Landis and Koch (0.61&#x2010;0.80) [<xref ref-type="bibr" rid="ref33">33</xref>].</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>We propose a framework intended to help mitigate bottlenecks in health care. By generating synthetic training data from a live FHIR server, this approach could serve as a reference for other institutions building secure, on-premise NLP solutions.</p><p>Through the use of the PIGEON dataset, created using the random sample generator, the fine-tuned PIGEON model achieved a competitive score on the FHIR resource generation task. The inclusion of popular open-source LLMs in the evaluation further underscored the efficacy of fine-tuned open-source models for downstream tasks. The PIGEON model achieved better performance than larger general-purpose models while offering the flexibility and cost-effectiveness associated with lower-parameter solutions, a finding consistent with recent work on resource-constrained clinical extraction [<xref ref-type="bibr" rid="ref34">34</xref>]. This finding aligns with the growing paradigm of small language models, where recent benchmarks indicate that compact, domain-specialized models can match the reasoning capabilities of massive generalist models in clinical settings while enabling efficient edge deployment [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>].</p><p>In the context of related work, recent studies suggest that LLMs offer advantages over traditional NLP techniques for clinical information extraction [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref37">37</xref>-<xref ref-type="bibr" rid="ref39">39</xref>]. 
Within this domain, agentic workflows have emerged as a promising solution for complex reasoning tasks in health care [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]. However, comparative analysis is often constrained by the predominance of commercial, closed-source LLMs and heterogeneous evaluation metrics. While works using proprietary models (eg, GPT-4) demonstrate high efficacy, challenges persist regarding data governance and General Data Protection Regulation adherence [<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref42">42</xref>]. This study differentiates itself from related work through its approach to dataset creation using real FHIR resources and by fine-tuning a resource-efficient LLM. This approach allows for feasible on-premise implementation on clinical infrastructure where data privacy is critical, addressing the &#x201C;closed versus open&#x201D; deployment dilemma highlighted in recent regulatory frameworks [<xref ref-type="bibr" rid="ref41">41</xref>].</p><p>A distinct advantage of this work is its capability to process unstructured clinical text. While standard extract, transform, load (ETL) pipelines remain essential for managing structured data, our approach serves as a complementary extension for handling complex clinical narratives. This supports a future trajectory where robust ETL infrastructure and flexible LLM-driven solutions coexist to address the full spectrum of health care data. In a daily clinical routine, this framework is intended to operate as a background service that parses discharge letters and populates the EHR with preextracted information for clinician verification, validating recent hypotheses that LLMs can effectively automate unstructured-to-structured data conversion [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]. 
Although our quantitative evaluation used deterministic coding accuracy as a deliberately strict benchmark, clinical workflows rarely require physicians to verify granular medical codes directly. In most health care systems, <italic>ICD</italic> coding is performed by dedicated professional coders rather than treating physicians [<xref ref-type="bibr" rid="ref45">45</xref>]. What clinicians need at the point of care is the correct identification and display of clinical entities: diagnosis names, medication names, laboratory values, and procedure descriptions. Qualitative feedback from our physician annotators consistently indicated that when the model identifies a clinical entity, it provides a plausible and contextually appropriate display name while reliably avoiding negated or excluded findings. This suggests that PIGEON shows promise as a pragmatic, low-cost information extractor within routine clinical workflows. By surfacing structured clinical summaries for physician review, the framework could shift the clinician&#x2019;s role from manual data entry to verification of preextracted information. Combined with the structured output constraints and retrieval-grounded postprocessing validated in the <italic>Results</italic> section, this design supports output-level verifiability and maintains human-in-the-loop oversight, which will remain essential as the system progresses toward routine clinical use. Furthermore, the modular framework architecture comprising the fine-tuned extraction model, the RAG-based code corrector, and preprocessing modules allows each component to be independently improved, so that future enhancements could translate into gains at the point of care without requiring retraining of the entire system.</p><p>To operationalize this workflow, we developed a comprehensive clinical application integrating PIGEON, which is being prepared for a controlled pilot deployment in the Department of Dermatology at the University Hospital Essen. 
This implementation demonstrates the end-to-end framework under realistic operating conditions, including connections to preprocessing modules, such as optical character recognition, to handle scanned clinical documents. A detailed explanation of this application&#x2019;s architecture and functionality is provided in Supplement B in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>Compared to existing approaches for clinical information extraction, the synthetic-data fine-tuning paradigm introduced here offers distinct advantages in scalability and generalization. Traditional supervised methods depend on costly manual annotation of real clinical documents, which limits dataset size and introduces annotator bias. Rule-based and named entity recognition pipelines, while reliable for narrow extraction tasks, require extensive domain-specific engineering and do not generalize well across clinical settings or documentation styles. Proprietary LLM-based solutions can achieve strong zero-shot performance but are constrained by data governance requirements that prohibit the transfer of patient data to external services. In contrast, the PIGEON framework generates training data programmatically from structured FHIR resources already available in the institutional backend, eliminating the need for manual annotation entirely. Because the random sample generator operates on the FHIR cache, any institution that adopts the FHIR standard could in principle replicate this workflow with its own data, producing a fine-tuned model tailored to its local documentation patterns, including department-specific terminology and formatting conventions, without sharing patient data externally. The synthetic-data evaluation, which spans discharge letters from various clinical departments, suggests that the approach is not specialty-bound, although this finding requires confirmation on real-world data from independent sites. 
Taken together, these results establish the technical feasibility of the approach; broader clinical adoption will require prospective multisite validation and formal regulatory assessment before routine use.</p><p>The study is subject to several limitations that constrain current conclusions. All evaluations were conducted at a single site, the University Hospital Essen, whose productive FHIR R4 server contains over 2 billion resources and is, to our knowledge, one of the largest single-institution FHIR repositories in Europe. While this scale supports a robust within-site evaluation, it does not establish that performance generalizes to hospitals with different documentation styles, FHIR profiles, or coding conventions. Cross-institutional validation was beyond the scope of this study under the General Data Protection Regulation and our ethics approval (24-12111-BO). The framework was also developed exclusively for German clinical documents, which restricts immediate generalizability to other languages. The real-world clinical evaluation was based on 30 discharge letters reviewed by clinicians from a single institution. Although the case-level scoring methodology was conservative and clinician collaboration ensured high data quality, this sample represents a narrow slice of documentation diversity, and broader multidepartmental validation would be required before routine clinical deployment. Procedure code extraction is the weakest domain in our evaluation and the most clinically relevant limitation for practical use. OPS codes are inherently difficult to extract: they encode anatomical site, laterality, technique, and access route in a single granular identifier and frequently require contextual inference from scattered narrative descriptions. 
Our hierarchy-aware analysis (Supplement F in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) indicates that the errors are concentrated at well-recognized points of OPS-catalog ambiguity [<xref ref-type="bibr" rid="ref46">46</xref>] rather than reflecting random miscoding. A further, inherent limitation arises from our random synthetic-letter generation, whose training OPS distribution is shaped by the source FHIR repository rather than by the real-world oncology deployment cohort. In practical terms, procedure-related output should be treated as a structured draft that supports rather than replaces expert coding. On the modeling side, our experiments were restricted to fine-tuning a single LLM, leaving open whether alternative architectures or ensemble strategies could yield further improvements. The comparison against alternative systems is restricted to one-shot prompting of general-purpose open-source LLMs (Qwen3, LLaMA, GPT-OSS) and does not include a fine-tuned or task-specific baseline. No publicly available task-specific system for German clinical information extraction to FHIR existed at the time of this study, and as detailed in the <italic>Methods</italic> section, fine-tuning the baseline models on the same PIGEON dataset would have conflated framework contribution with architectural differences. The observed performance gap should therefore be interpreted as reflecting the advantage of domain-specific fine-tuning over prompt-based extraction under realistic deployment conditions, rather than as a claim of architectural superiority over alternative task-specific systems.</p><p>Looking ahead, the modular design of this framework offers potential for further extension, particularly when considering the rapid trajectory of LLM capabilities. 
While this study focused on German clinical documents, the methodology is inherently flexible; future work could extend this framework to multilingual datasets to facilitate cross-border interoperability [<xref ref-type="bibr" rid="ref23">23</xref>], while also upgrading the random sample generator itself. By leveraging more advanced, reasoning-capable LLMs as teacher models, the framework could move beyond structural randomization to synthesize complex, high-fidelity patient histories. This evolution would allow institutions to generate synthetic training data for edge cases, effectively solving the &#x201C;long-tail&#x201D; data scarcity problem without the privacy risks associated with real-world records. To mitigate the bottleneck of manual annotation, we suggest using an LLM-as-a-judge framework. Recent studies [<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref48">48</xref>] suggest that stronger, reasoning-heavy models can serve as scalable automated evaluators for clinical NLP tasks, significantly expanding the validation set without incurring additional human cost. Addressing the remaining constraints by incorporating multilingual datasets represents a valuable direction for future research. In parallel with multilingual extension, prospective multisite validation across other German university hospitals is the most immediate next step. The German Medizininformatik-Initiative [<xref ref-type="bibr" rid="ref9">9</xref>] and its associated FHIR-aligned data integration centers provide a natural infrastructural basis for such a study, since participating sites already operate FHIR R4 backends with comparable resource profiles. This allows the random sample generator and fine-tuning workflow to be reinstantiated on each site&#x2019;s own data without transferring patient records, directly testing the framework&#x2019;s portability hypothesis under realistic data-governance constraints. 
Future investigations should also focus on combining this generative approach with multiple small language models by dividing the task into smaller subtasks to enhance computational efficiency. Furthermore, procedure code extraction would benefit from targeted training on large collections of real-world procedure descriptions paired with verified OPS codes and complemented by catalog-aware postprocessing that handles common multichapter ambiguity classes permissively. The inherent ambiguity of procedure-to-code mappings will, however, continue to pose challenges and motivates the continued use of expert review in the procedure-extraction workflow. Finally, as task-specific baselines for German clinical information extraction become available, benchmarking against such adapted systems would provide a more rigorous evaluation of the proposed architecture.</p></sec><sec id="s4-2"><title>Conclusions</title><p>This study presents and evaluates a privacy-preserving and resource-efficient framework for enhancing health care data interoperability. We demonstrate that a resource-efficient, open-source LLM, fine-tuned on a synthetic dataset derived from authentic FHIR resources, can transform unstructured German discharge letters into standardized formats with competitive accuracy. This approach complements traditional ETL pipelines and offers a secure alternative to large, proprietary models. As a single-site proof-of-concept conducted on one of Europe&#x2019;s largest institutional FHIR repositories, this work demonstrates the technical feasibility of the approach and provides a practical foundation for health care systems seeking to leverage unstructured clinical data. Prospective multisite validation across independent FHIR-aligned institutions remains a necessary next step before routine clinical use, with the long-term goal of improving operational efficiency and patient care.</p></sec></sec></body><back><ack><p>All figures were created with BioRender. 
The data for this project were provided by the Smart Hospital Information Platform, managed by the Data Integration Center at the University Medicine Essen. The Smart Hospital Information Platform serves as a comprehensive digital health platform for integrating data from all major clinical subsystems using a holistic Fast Healthcare Interoperability Resources&#x2013;based approach. It enables the purification, analysis, distribution, and visualization of clinical data. Generative artificial intelligence tools were used during the preparation of this manuscript for language editing and text refinement. All artificial intelligence&#x2013;generated suggestions were reviewed, verified, and edited by the authors. The authors take full responsibility for the accuracy and integrity of the final manuscript content.</p></ack><notes><sec><title>Funding</title><p>The work of BE, MB, TMGP, and HD was funded by a PhD grant from the DFG Research Training Group 2535 Knowledge-based and data-based personalization of medicine at the point of care (WisPerMed).</p></sec><sec><title>Data Availability</title><p>The dataset used in this study is not publicly available. Individuals or academic organizations interested in using this dataset must submit a detailed request to Data-Governance@uk-essen.de, which will be reviewed on a case-by-case basis.</p><p>The code for the presented approach will be available on GitHub [<xref ref-type="bibr" rid="ref49">49</xref>] with an end-to-end pipeline and a working example.</p></sec></notes><fn-group><fn fn-type="con"><p>BE, KA, FN, and RH conceptualized the study, extracted and preprocessed the data, developed the codebase, executed the experiments, and wrote the manuscript. MB, HD, SW, HS, AI-Y, TMGP, and KB provided methodological support. JK and CMF supported the technical infrastructure. LJA, GL, EL, and DS provided evaluation support and clinical feedback. 
All authors critically revised the manuscript and approved the final version.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations:</title><def-list><def-item><term id="abb1">ATC</term><def><p>Anatomical Therapeutic Chemical</p></def></def-item><def-item><term id="abb2">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb3">ETL</term><def><p>extract, transform, load</p></def></def-item><def-item><term id="abb4">FHIR</term><def><p>Fast Healthcare Interoperability Resources</p></def></def-item><def-item><term id="abb5"><italic>ICD-10</italic></term><def><p><italic>International Classification of Diseases, 10th Revision</italic></p></def></def-item><def-item><term id="abb6">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb7">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb8">OPS</term><def><p>Operation and Procedure Classification</p></def></def-item><def-item><term id="abb9">PIGEON</term><def><p>Patient Information Generation from Organized Notes</p></def></def-item><def-item><term id="abb10">RAG</term><def><p>retrieval-augmented generation</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hripcsak</surname><given-names>G</given-names> </name><name name-style="western"><surname>Albers</surname><given-names>DJ</given-names> </name></person-group><article-title>Next-generation phenotyping of electronic health records</article-title><source>J Am Med Inform Assoc</source><year>2013</year><month>01</month><day>1</day><volume>20</volume><issue>1</issue><fpage>117</fpage><lpage>121</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2012-001145</pub-id><pub-id pub-id-type="medline">22955496</pub-id></nlm-citation></ref><ref 
id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tayefi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ngo</surname><given-names>P</given-names> </name><name name-style="western"><surname>Chomutare</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Challenges and opportunities beyond structured data in analysis of electronic health records</article-title><source>WIREs Comput Stat</source><year>2021</year><volume>13</volume><issue>6</issue><fpage>e1549</fpage><pub-id pub-id-type="doi">10.1002/wics.1549</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kong</surname><given-names>HJ</given-names> </name></person-group><article-title>Managing unstructured big data in healthcare system</article-title><source>Healthc Inform Res</source><year>2019</year><month>01</month><volume>25</volume><issue>1</issue><fpage>1</fpage><lpage>2</lpage><pub-id pub-id-type="doi">10.4258/hir.2019.25.1.1</pub-id><pub-id pub-id-type="medline">30788175</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Edmondson</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Reimer</surname><given-names>AP</given-names> </name></person-group><article-title>Challenges frequently encountered in the secondary use of electronic medical record data for research</article-title><source>Comput Inform Nurs</source><year>2020</year><month>07</month><volume>38</volume><issue>7</issue><fpage>338</fpage><lpage>348</lpage><pub-id pub-id-type="doi">10.1097/CIN.0000000000000609</pub-id><pub-id pub-id-type="medline">32149742</pub-id></nlm-citation></ref><ref 
id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bockhacker</surname><given-names>M</given-names> </name><name name-style="western"><surname>Martens</surname><given-names>P</given-names> </name><name name-style="western"><surname>von M&#x00FC;nchow</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Lessons learned from building a data platform for longitudinal, analytical use cases and scaling to 77 German hospitals: implementation report</article-title><source>JMIR Med Inform</source><year>2025</year><month>09</month><day>12</day><volume>13</volume><fpage>e69853</fpage><pub-id pub-id-type="doi">10.2196/69853</pub-id><pub-id pub-id-type="medline">40939633</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Golburean</surname><given-names>O</given-names> </name><name name-style="western"><surname>Pedersen</surname><given-names>R</given-names> </name><name name-style="western"><surname>Melby</surname><given-names>L</given-names> </name><name name-style="western"><surname>Faxvaag</surname><given-names>A</given-names> </name></person-group><article-title>Exploring physicians&#x2019; dual perspectives on the transition from free text to structured and standardized documentation practices: interview and participant observational study</article-title><source>JMIR Form Res</source><year>2025</year><month>03</month><day>21</day><volume>9</volume><fpage>e63902</fpage><pub-id pub-id-type="doi">10.2196/63902</pub-id><pub-id pub-id-type="medline">40117572</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Rastegar-Mojarad</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Clinical information extraction applications: a literature review</article-title><source>J Biomed Inform</source><year>2018</year><month>01</month><volume>77</volume><fpage>34</fpage><lpage>49</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2017.11.011</pub-id><pub-id pub-id-type="medline">29162496</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Bender</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sartipi</surname><given-names>K</given-names> </name></person-group><article-title>HL7 FHIR: an Agile and RESTful approach to healthcare information exchange</article-title><conf-name>Proceedings of the 26th IEEE International Symposium on Computer-Based Medical Systems</conf-name><conf-date>Jun 20-22, 2013</conf-date><pub-id pub-id-type="doi">10.1109/CBMS.2013.6627810</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Semler</surname><given-names>SC</given-names> </name><name name-style="western"><surname>Wissing</surname><given-names>F</given-names> </name><name name-style="western"><surname>Heyder</surname><given-names>R</given-names> </name></person-group><article-title>German Medical Informatics Initiative</article-title><source>Methods Inf Med</source><year>2018</year><month>07</month><volume>57</volume><issue>S 01</issue><fpage>e50</fpage><lpage>e56</lpage><pub-id pub-id-type="doi">10.3414/ME18-03-0003</pub-id><pub-id pub-id-type="medline">30016818</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="report"><article-title>Regulation 
(EU) 2025/327 of the European Parliament and of the Council of 11 February 2025 on the European Health Data Space and amending Directive 2011/24/EU and Regulation (EU) 2024/2847 (Text with EEA relevance)</article-title><year>2025</year><access-date>2025-12-22</access-date><publisher-name>European Union</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="http://data.europa.eu/eli/reg/2025/327/oj">http://data.europa.eu/eli/reg/2025/327/oj</ext-link></comment></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yerebakan</surname><given-names>HZ</given-names> </name><name name-style="western"><surname>Shinagawa</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Luo</surname><given-names>Y</given-names> </name></person-group><article-title>FHIR-GPT enhances health interoperability with large language models</article-title><source>NEJM AI</source><year>2024</year><month>08</month><volume>1</volume><issue>8</issue><pub-id pub-id-type="doi">10.1056/aics2300301</pub-id><pub-id pub-id-type="medline">40746832</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Mathews</surname><given-names>WC</given-names> </name><name name-style="western"><surname>Pham</surname><given-names>HA</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name></person-group><article-title>Opioid2FHIR: a system for extracting 
FHIR-compatible opioid prescriptions from clinical text</article-title><conf-name>2020 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)</conf-name><conf-date>Dec 16-19, 2020</conf-date><pub-id pub-id-type="doi">10.1109/BIBM49941.2020.9313258</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Ghali</surname><given-names>MK</given-names> </name><name name-style="western"><surname>Farrag</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sakai</surname><given-names>H</given-names> </name></person-group><article-title>GAMedX: generative AI-based medical entity data extractor using large language models</article-title><source>arXiv</source><comment>Preprint posted online on May 31, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2405.20585</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sohn</surname><given-names>S</given-names> </name><name name-style="western"><surname>Clark</surname><given-names>C</given-names> </name><name name-style="western"><surname>Halgrim</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Murphy</surname><given-names>SP</given-names> </name><name name-style="western"><surname>Chute</surname><given-names>CG</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>H</given-names> </name></person-group><article-title>MedXN: an open source medication extraction and normalization tool for clinical text</article-title><source>J Am Med Inform Assoc</source><year>2014</year><volume>21</volume><issue>5</issue><fpage>858</fpage><lpage>865</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2013-002190</pub-id><pub-id pub-id-type="medline">24637954</pub-id></nlm-citation></ref><ref 
id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Savova</surname><given-names>GK</given-names> </name><name name-style="western"><surname>Masanz</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Ogren</surname><given-names>PV</given-names> </name><etal/></person-group><article-title>Mayo clinical Text Analysis and Knowledge Extraction System (cTAKES): architecture, component evaluation and applications</article-title><source>J Am Med Inform Assoc</source><year>2010</year><volume>17</volume><issue>5</issue><fpage>507</fpage><lpage>513</lpage><pub-id pub-id-type="doi">10.1136/jamia.2009.001560</pub-id><pub-id pub-id-type="medline">20819853</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hou</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhuang</surname><given-names>Y</given-names> </name></person-group><article-title>LLM-Integrated Normalization and Knowledge for FHIR (LINK-FHIR)</article-title><source>Stud Health Technol Inform</source><year>2025</year><month>08</month><day>7</day><volume>329</volume><fpage>17</fpage><lpage>21</lpage><pub-id pub-id-type="doi">10.3233/SHTI250793</pub-id><pub-id pub-id-type="medline">40775811</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Frei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Feldhus</surname><given-names>N</given-names> </name><name 
name-style="western"><surname>Raithel</surname><given-names>L</given-names> </name><name name-style="western"><surname>Roller</surname><given-names>R</given-names> </name><name name-style="western"><surname>Meyer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kramer</surname><given-names>F</given-names> </name></person-group><article-title>Infherno: end-to-end agent-based FHIR resource synthesis from free-form clinical notes</article-title><conf-name>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics</conf-name><conf-date>Mar 24-29, 2026</conf-date><pub-id pub-id-type="doi">10.18653/v1/2026.eacl-demo.13</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Arzideh</surname><given-names>K</given-names> </name><name name-style="western"><surname>Sch&#x00E4;fer</surname><given-names>H</given-names> </name><name name-style="western"><surname>Allende-Cid</surname><given-names>H</given-names> </name><etal/></person-group><article-title>From BERT to generative AI&#x2014;comparing encoder-only vs. 
large language models in a cohort of lung cancer patients for named entity recognition in unstructured medical reports</article-title><source>Comput Biol Med</source><year>2025</year><month>09</month><volume>195</volume><fpage>110665</fpage><pub-id pub-id-type="doi">10.1016/j.compbiomed.2025.110665</pub-id><pub-id pub-id-type="medline">40554973</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Majid</surname><given-names>I</given-names> </name><name name-style="western"><surname>Mishra</surname><given-names>V</given-names> </name><name name-style="western"><surname>Ravindranath</surname><given-names>R</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>SY</given-names> </name></person-group><article-title>Evaluating the performance of large language models for named entity recognition in ophthalmology clinical free-text notes</article-title><source>AMIA Annu Symp Proc</source><year>2025</year><volume>2024</volume><fpage>778</fpage><lpage>787</lpage><pub-id pub-id-type="medline">40417582</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lenz</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ustjanzew</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jeray</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ressing</surname><given-names>M</given-names> </name><name name-style="western"><surname>Panholzer</surname><given-names>T</given-names> </name></person-group><article-title>Can open source large language models be used for tumor documentation in Germany?&#x2014;An evaluation on urological doctors&#x2019; notes</article-title><source>BioData 
Min</source><year>2025</year><month>07</month><day>24</day><volume>18</volume><issue>1</issue><fpage>48</fpage><pub-id pub-id-type="doi">10.1186/s13040-025-00463-8</pub-id><pub-id pub-id-type="medline">40707949</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Spiegel</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yimam</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Breitfeld</surname><given-names>P</given-names> </name><name name-style="western"><surname>&#x00DC;ckert</surname><given-names>F</given-names> </name></person-group><article-title>Adaption and evaluation of generative large language models for German medical information extraction</article-title><access-date>2026-06-12</access-date><conf-name>Proceedings of the 21st Conference on Natural Language Processing (KONVENS 2025): Long and Short Papers</conf-name><conf-date>Sep 9-12, 2025</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2025.konvens-1.4.pdf">https://aclanthology.org/2025.konvens-1.4.pdf</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goyal</surname><given-names>M</given-names> </name><name name-style="western"><surname>Mahmoud</surname><given-names>QH</given-names> </name></person-group><article-title>A systematic review of synthetic data generation techniques using generative AI</article-title><source>Electronics</source><year>2024</year><volume>13</volume><issue>17</issue><fpage>3509</fpage><pub-id pub-id-type="doi">10.3390/electronics13173509</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Nad&#x01CE;&#x015F;</surname><given-names>M</given-names> </name><name name-style="western"><surname>Dio&#x015F;an</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tomescu</surname><given-names>A</given-names> </name></person-group><article-title>Synthetic data generation using large language models: advances in text and code</article-title><source>IEEE Access</source><year>2025</year><volume>13</volume><fpage>134615</fpage><lpage>134633</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2025.3589503</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Baumel</surname><given-names>T</given-names> </name><name name-style="western"><surname>Manoel</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jones</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Controllable synthetic clinical note generation with privacy guarantees</article-title><source>arXiv</source><comment>Preprint posted online on Sep 12, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2409.07809</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>A</given-names> </name><name name-style="western"><surname>Li</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Qwen3 technical report</article-title><source>arXiv</source><comment>Preprint posted online on May 14, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2505.09388</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation 
citation-type="web"><article-title>unslothai/unsloth</article-title><source>GitHub</source><access-date>2025-11-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/unslothai/unsloth">https://github.com/unslothai/unsloth</ext-link></comment></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ina</surname><given-names>T</given-names> </name><name name-style="western"><surname>Idomura</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Imamura</surname><given-names>T</given-names> </name><name name-style="western"><surname>Onodera</surname><given-names>N</given-names> </name></person-group><article-title>A new data conversion method for mixed precision Krylov solvers with FP16/BF16 Jacobi preconditioners</article-title><conf-name>Proceedings of the International Conference on High Performance Computing in Asia-Pacific Region</conf-name><conf-date>Feb 27 to Mar 2, 2023</conf-date><pub-id pub-id-type="doi">10.1145/3578178.3578222</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jaccard</surname><given-names>P</given-names> </name></person-group><article-title>&#x00C9;tude de la distribution florale dans une portion des Alpes et du Jura [Article in French]</article-title><source>Bull Soc Vaudoise Sci Nat</source><year>1901</year><volume>37</volume><issue>142</issue><fpage>547</fpage><lpage>579</lpage><pub-id pub-id-type="doi">10.5169/seals-266450</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lewis</surname><given-names>P</given-names> </name><name name-style="western"><surname>Perez</surname><given-names>E</given-names> </name><name 
name-style="western"><surname>Piktus</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Retrieval-augmented generation for knowledge-intensive NLP tasks</article-title><access-date>2026-06-12</access-date><conf-name>Proceedings of the 34th International Conference on Neural Information Processing Systems</conf-name><conf-date>Dec 6-12, 2020</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/abs/10.5555/3495724.3496517">https://dl.acm.org/doi/abs/10.5555/3495724.3496517</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Douze</surname><given-names>M</given-names> </name><name name-style="western"><surname>Guzhva</surname><given-names>A</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>C</given-names> </name><etal/></person-group><article-title>The Faiss library</article-title><source>IEEE Trans Big Data</source><year>2026</year><volume>12</volume><issue>2</issue><fpage>346</fpage><lpage>361</lpage><pub-id pub-id-type="doi">10.1109/TBDATA.2025.3618474</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="web"><article-title>ICD&#x2011;Code&#x2011;Suche [Article in German]</article-title><source>gesund.bund.de</source><access-date>2025-11-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://gesund.bund.de/icd-code-suche">https://gesund.bund.de/icd-code-suche</ext-link></comment></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cohen</surname><given-names>J</given-names> </name></person-group><article-title>A coefficient of agreement for nominal scales</article-title><source>Educ Psychol 
Meas</source><year>1960</year><month>04</month><volume>20</volume><issue>1</issue><fpage>37</fpage><lpage>46</lpage><pub-id pub-id-type="doi">10.1177/001316446002000104</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Landis</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Koch</surname><given-names>GG</given-names> </name></person-group><article-title>The measurement of observer agreement for categorical data</article-title><source>Biometrics</source><year>1977</year><month>03</month><volume>33</volume><issue>1</issue><fpage>159</fpage><lpage>174</lpage><pub-id pub-id-type="doi">10.2307/2529310</pub-id><pub-id pub-id-type="medline">843571</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Builtjes</surname><given-names>L</given-names> </name><name name-style="western"><surname>Bosma</surname><given-names>J</given-names> </name><name name-style="western"><surname>Prokop</surname><given-names>M</given-names> </name><name name-style="western"><surname>van Ginneken</surname><given-names>B</given-names> </name><name name-style="western"><surname>Hering</surname><given-names>A</given-names> </name></person-group><article-title>Leveraging open-source large language models for clinical information extraction in resource-constrained settings</article-title><source>JAMIA Open</source><year>2025</year><month>10</month><volume>8</volume><issue>5</issue><fpage>ooaf109</fpage><pub-id pub-id-type="doi">10.1093/jamiaopen/ooaf109</pub-id><pub-id pub-id-type="medline">41041625</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Garg</surname><given-names>M</given-names> 
</name><name name-style="western"><surname>Raza</surname><given-names>S</given-names> </name><name name-style="western"><surname>Rayana</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Sohn</surname><given-names>S</given-names> </name></person-group><article-title>The rise of small language models in healthcare: a comprehensive survey</article-title><source>Comput Sci Rev</source><year>2026</year><month>11</month><volume>62</volume><fpage>100999</fpage><pub-id pub-id-type="doi">10.1016/j.cosrev.2026.100999</pub-id><pub-id pub-id-type="medline">42222656</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Teitge</surname><given-names>B</given-names> </name><name name-style="western"><surname>Holodinsky</surname><given-names>J</given-names> </name><name name-style="western"><surname>Drew</surname><given-names>S</given-names> </name></person-group><article-title>Small language models for emergency departments decision support: a benchmark study</article-title><conf-name>2025 IEEE Smart World Congress (SWC)</conf-name><conf-date>Aug 18-22, 2025</conf-date><pub-id pub-id-type="doi">10.1109/SWC65939.2025.00239</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Clusmann</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kolbinger</surname><given-names>FR</given-names> </name><name name-style="western"><surname>Muti</surname><given-names>HS</given-names> </name><etal/></person-group><article-title>The future 
landscape of large language models in medicine</article-title><source>Commun Med (Lond)</source><year>2023</year><month>10</month><day>10</day><volume>3</volume><issue>1</issue><fpage>141</fpage><pub-id pub-id-type="doi">10.1038/s43856-023-00370-1</pub-id><pub-id pub-id-type="medline">37816837</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zuo</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Information extraction from clinical notes: are we ready to switch to large language models?</article-title><source>J Am Med Inform Assoc</source><year>2026</year><month>03</month><day>1</day><volume>33</volume><issue>3</issue><fpage>553</fpage><lpage>562</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaf213</pub-id><pub-id pub-id-type="medline">41533750</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Menezes</surname><given-names>MCS</given-names> </name><name name-style="western"><surname>Hoffmann</surname><given-names>AF</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>ALM</given-names> </name><etal/></person-group><article-title>The potential of Generative Pre-trained Transformer 4 (GPT-4) to analyse medical notes in three different languages: a retrospective model-evaluation study</article-title><source>Lancet Digit Health</source><year>2025</year><month>01</month><volume>7</volume><issue>1</issue><fpage>e35</fpage><lpage>e43</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(24)00246-2</pub-id><pub-id pub-id-type="medline">39722251</pub-id></nlm-citation></ref><ref 
id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Qiu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lam</surname><given-names>K</given-names> </name><name name-style="western"><surname>Li</surname><given-names>G</given-names> </name><etal/></person-group><article-title>LLM-based agentic systems in medicine and healthcare</article-title><source>Nat Mach Intell</source><year>2024</year><volume>6</volume><issue>12</issue><fpage>1418</fpage><lpage>1420</lpage><pub-id pub-id-type="doi">10.1038/s42256-024-00944-1</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dennst&#x00E4;dt</surname><given-names>F</given-names> </name><name name-style="western"><surname>Hastings</surname><given-names>J</given-names> </name><name name-style="western"><surname>Putora</surname><given-names>PM</given-names> </name><name name-style="western"><surname>Schmerder</surname><given-names>M</given-names> </name><name name-style="western"><surname>Cihoric</surname><given-names>N</given-names> </name></person-group><article-title>Implementing large language models in healthcare while balancing control, collaboration, costs and security</article-title><source>NPJ Digit Med</source><year>2025</year><month>03</month><day>6</day><volume>8</volume><issue>1</issue><fpage>143</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01476-7</pub-id><pub-id pub-id-type="medline">40050366</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="report"><article-title>AI privacy risks &#x0026; mitigations: large language models (LLMs)</article-title><year>2025</year><access-date>2025-12-12</access-date><publisher-name>European Data Protection Board (EDPB)</publisher-name><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.edpb.europa.eu/system/files/2025-04/ai-privacy-risks-and-mitigations-in-llms.pdf">https://www.edpb.europa.eu/system/files/2025-04/ai-privacy-risks-and-mitigations-in-llms.pdf</ext-link></comment></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Delaunay</surname><given-names>J</given-names> </name><name name-style="western"><surname>Girbes</surname><given-names>D</given-names> </name><name name-style="western"><surname>Cusido</surname><given-names>J</given-names> </name></person-group><article-title>Evaluating the effectiveness of large language models in converting clinical data to FHIR format</article-title><source>Appl Sci</source><year>2025</year><volume>15</volume><issue>6</issue><fpage>3379</fpage><pub-id pub-id-type="doi">10.3390/app15063379</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Brach</surname><given-names>W</given-names> </name><name name-style="western"><surname>Ko&#x0161;&#x0165;&#x00E1;l</surname><given-names>K</given-names> </name><name name-style="western"><surname>Ries</surname><given-names>M</given-names> </name></person-group><article-title>The effectiveness of large language models in transforming unstructured text to standardized formats</article-title><source>IEEE Access</source><year>2025</year><volume>13</volume><fpage>91808</fpage><lpage>91825</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2025.3573030</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Otero Varela</surname><given-names>L</given-names> </name><name name-style="western"><surname>Doktorchik</surname><given-names>C</given-names> </name><name 
name-style="western"><surname>Wiebe</surname><given-names>N</given-names> </name><etal/></person-group><article-title>International Classification of Diseases clinical coding training: an international survey</article-title><source>Health Inf Manag</source><year>2024</year><month>05</month><volume>53</volume><issue>2</issue><fpage>68</fpage><lpage>75</lpage><pub-id pub-id-type="doi">10.1177/18333583221106509</pub-id><pub-id pub-id-type="medline">35838185</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="report"><article-title>Kodierleitfaden H&#x00E4;matologie, Onkologie und Stammzelltransplantation, Version 2020 [Report in German]</article-title><year>2020</year><access-date>2026-06-12</access-date><publisher-name>Deutsche Gesellschaft f&#x00FC;r H&#x00E4;matologie und Medizinische Onkologie e.V. (DGHO e.V.)</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.dgho.de/arbeitskreise/a-g/drg-gesundheitsoekonomie/kodierleitfaden/dgho-kodierleitfaden_2020_buch_ak2_final.pdf">https://www.dgho.de/arbeitskreise/a-g/drg-gesundheitsoekonomie/kodierleitfaden/dgho-kodierleitfaden_2020_buch_ak2_final.pdf</ext-link></comment></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kocaman</surname><given-names>V</given-names> </name><name name-style="western"><surname>Kaya</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Feier</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Talby</surname><given-names>D</given-names> </name></person-group><article-title>Clinical large language model evaluation by expert review (CLEVER): framework development and validation</article-title><source>JMIR AI</source><year>2025</year><month>12</month><day>4</day><volume>4</volume><fpage>e72153</fpage><pub-id pub-id-type="doi">10.2196/72153</pub-id><pub-id 
pub-id-type="medline">41343765</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Laskar</surname><given-names>MTR</given-names> </name><name name-style="western"><surname>Jahan</surname><given-names>I</given-names> </name><name name-style="western"><surname>Dolatabadi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Peng</surname><given-names>C</given-names> </name><name name-style="western"><surname>Hoque</surname><given-names>E</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>JX</given-names> </name></person-group><article-title>Improving automatic evaluation of large language models (LLMs) in biomedical relation extraction via LLMs-as-the-judge</article-title><conf-name>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</conf-name><conf-date>Jul 27 to Aug 1, 2025</conf-date><pub-id pub-id-type="doi">10.18653/v1/2025.acl-long.1238</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="web"><article-title>UMEssen/PIGEON</article-title><source>GitHub</source><access-date>2026-06-17</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/UMEssen/PIGEON">https://github.com/UMEssen/PIGEON</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Prompt and synthetic data generation templates along with training parameters and error analyses.</p><media xlink:href="jmir_v28i1e92413_app1.docx" xlink:title="DOCX File, 188 KB"/></supplementary-material></app-group></back></article>