<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e86365</article-id><article-id pub-id-type="doi">10.2196/86365</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Context-Aware Sentence Classification of Radiology Reports Using Synthetic Data: Development and Validation Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Kikuchi</surname><given-names>Tomohiro</given-names></name><degrees>MPH, MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yamagishi</surname><given-names>Yosuke</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yamamoto</surname><given-names>Kohei</given-names></name><degrees>MMSc</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Akashi</surname><given-names>Toshiaki</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Mori</surname><given-names>Harushi</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Makimoto</surname><given-names>Hisaki</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kohro</surname><given-names>Takahide</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Data Science Center, Jichi Medical University</institution><addr-line>Tochigi</addr-line><country>Japan</country></aff><aff id="aff2"><institution>Department of Radiology, Jichi Medical University</institution><addr-line>3311-1, Yakushiji, Shimotsuke</addr-line><addr-line>Tochigi</addr-line><country>Japan</country></aff><aff id="aff3"><institution>Department of Computational Diagnostic Radiology and Preventive Medicine, The University of Tokyo Hospital</institution><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff4"><institution>Division of Radiology and Biomedical Engineering, Graduate School of Medicine, The University of Tokyo</institution><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff5"><institution>Department of Radiology, Juntendo University School of Medicine</institution><addr-line>Tokyo</addr-line><country>Japan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Liu</surname><given-names>Dan</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Zhang</surname><given-names>Jun</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Tomohiro Kikuchi, MPH, MD, PhD, Department of Radiology, Jichi Medical University, 3311-1, Yakushiji, Shimotsuke, Tochigi, 329-0498, Japan, 81 285-58-7362; <email>r1419kt@jichi.ac.jp</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>10</day><month>4</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e86365</elocation-id><history><date date-type="received"><day>23</day><month>10</month><year>2025</year></date><date date-type="rev-recd"><day>28</day><month>02</month><year>2026</year></date><date date-type="accepted"><day>28</day><month>02</month><year>2026</year></date></history><copyright-statement>&#x00A9; Tomohiro Kikuchi, Yosuke Yamagishi, Kohei Yamamoto, Toshiaki Akashi, Harushi Mori, Hisaki Makimoto, Takahide Kohro. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 10.4.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e86365"/><abstract><sec><title>Background</title><p>Automated structuring of radiology reports is essential for data utilization and the development of medical artificial intelligence models. However, manual annotation by experts is labor-intensive, and processing real clinical data through commercial large language models (LLMs) presents significant privacy risks. These challenges are particularly pronounced for non-English languages like Japanese, where specialized medical corpora are scarce. While synthetic data generation offers a potential privacy-preserving alternative, its effectiveness in capturing complex clinical nuances&#x2014;such as negation and contextual dependencies&#x2014;to train robust classification models without any real-world training data has not been fully established.</p></sec><sec><title>Objective</title><p>This study aimed to develop a context-aware sentence classification model for Japanese radiology reports using an entirely synthetic training pipeline, thereby eliminating reliance on real-world clinical data during the development phase. Furthermore, we sought to evaluate the generalizability of this approach by validating the model&#x2019;s performance on diverse, multi-institutional, real-world reports.</p></sec><sec sec-type="methods"><title>Methods</title><p>Japanese radiology reports (n=3104) were generated using GPT-4.1 and automatically annotated at the sentence level into 4 categories (background, positive finding, negative finding, and continuation) using GPT-4.1-mini. The synthetic data were partitioned into training (n=2670), validation (n=334), and test (n=100) sets. 
We fine-tuned several models, including lightweight local LLMs (Qwen3 and Llama 3.2 series) using low-rank adaptation and Japanese text classification models (Bidirectional Encoder Representations from Transformers [BERT]-base Japanese v3, Japanese Medical Robustly Optimized BERT Pretraining Approach [JMedRoBERTa]-base, and ModernBERT-Ja-130M). External validation was performed using 280 real-world reports (3477 sentences) from 7 institutions in the Japan Medical Image Database, with ground-truth labels established by board-certified radiologists. Evaluation metrics included accuracy, macro-averaged <italic>F</italic><sub>1</sub> (macro <italic>F</italic><sub>1</sub>) score, and positive predictive value for positive findings (PPV_1).</p></sec><sec sec-type="results"><title>Results</title><p>All models achieved high performance on the synthetic test set (accuracy: 0.938&#x2010;0.951; macro <italic>F</italic><sub>1</sub>-score: 0.924&#x2010;0.940). Overall performance declined on the external validation dataset (accuracy: 0.783&#x2010;0.813; macro <italic>F</italic><sub>1</sub>-score: 0.761&#x2010;0.790), reflecting distributional differences between synthetic and real-world reports; however, PPV_1 remained stable and high across datasets (eg, 0.957 on the synthetic test set vs 0.952 on the external validation dataset for Qwen3 [4B]). Parsing errors occurred in LLM-based approaches (19&#x2010;260 sentences, 0.55%&#x2010;7.48% in the external dataset).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This study demonstrates the feasibility of developing context-aware sentence classification models for Japanese radiology reports using a training pipeline based entirely on synthetic data. The stability of PPV_1 indicates that the models successfully captured the essential clinical terminology and linguistic patterns required to identify positive findings in real-world reports, despite the observed performance degradation during external validation. 
This approach substantially reduces manual annotation requirements and privacy risks, providing a scalable foundation for constructing structured radiology datasets to support the development of clinically relevant medical artificial intelligence models.</p></sec></abstract><kwd-group><kwd>natural language processing</kwd><kwd>artificial intelligence</kwd><kwd>large language models</kwd><kwd>data annotation</kwd><kwd>radiology report</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>In recent years, vision-language models (VLMs) have been increasingly applied to medical image analysis [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. VLMs are multimodal architectures that commonly integrate large language models (LLMs) for textual processing with visual encoders for image representation, enabling tasks such as image captioning, visual question answering, and image-text matching [<xref ref-type="bibr" rid="ref3">3</xref>]. In the medical domain, VLMs are typically trained on paired image-text data to associate imaging findings with their corresponding textual descriptions, supporting applications including automated report generation, cross-modal retrieval, and clinical decision support [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref6">6</xref>]. Training these models requires large-scale image-text pairs, and both the quality and quantity of such pairs critically affect the performance of the model [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>].</p><p>Several large-scale, English-language medical image-text datasets, such as the Medical Information Mart for Intensive Care Chest X-Ray (MIMIC-CXR) and CT-RATE, have recently been released [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref11">11</xref>]. These resources have become foundational for training VLMs. 
Consequently, many state-of-the-art VLMs are predominantly trained and evaluated on English data, largely due to the sheer volume of available resources. When applied to other languages, these models often exhibit performance degradation due to linguistic disparities, such as variations in negation, uncertainty expressions, and reporting conventions across institutions [<xref ref-type="bibr" rid="ref12">12</xref>]. This is especially true for languages such as Japanese; while they are well-supported in general-domain natural language processing (NLP), the scarcity of public radiology-specific corpora creates a low-resource environment in the medical domain [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. Although some non-English resources such as PadChest exist, they remain limited in scale or scope [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. Simple cross-lingual transfer or translation-based approaches may fail to capture language-specific clinical nuances and local disease prevalence. Furthermore, many publicly available datasets are biased toward prototypical or specific disease cases and do not adequately reflect the diversity of findings and contexts encountered in daily clinical practice [<xref ref-type="bibr" rid="ref17">17</xref>].</p><p>To develop VLMs that can be used in real-world radiology workflows across different countries, it is essential to establish and continuously curate language-specific datasets that reflect local clinical practices. However, radiology reports are typically unstructured free text, exhibiting significant variations in structure and style across institutions and radiologists. They often contain diverse sentence types, including background information, positive findings, negative findings, and continuation from previous sentences. 
Consequently, simple sentence segmentation is inadequate because sentence boundaries often do not align with thematic transitions, and reports frequently include content that is irrelevant to the actual image findings. Therefore, constructing automated image-text pairs requires text processing&#x2014;specifically, context-aware sentence classification&#x2014;as a preliminary step before alignment. While commercial LLMs have shown success in radiology tasks, developing a pipeline that transmits clinical reports to external services is often unacceptable due to ethical, regulatory, and privacy concerns [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. Beyond these, economic and technical factors pose substantial challenges; high cumulative application programming interface costs and nontransparent model updates hinder large-scale data curation and compromise reproducibility [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. Consequently, local models are necessary; however, manual annotation for in-house training data is extremely labor-intensive [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>].</p><p>To address these challenges, we leveraged a commercial LLM solely to generate synthetic Japanese radiology reports and perform automated sentence-level annotation, eliminating the need for labor-intensive manual annotation. This synthetic dataset was used to fine-tune lightweight local LLMs, enabling efficient model development while strictly safeguarding patient privacy by avoiding the use of actual clinical data. This study aimed to develop a context-aware sentence classification model for segmenting Japanese radiology reports into finding-level textual units that are suitable for pairing with images for VLM training. 
We investigated the feasibility of executing the entire training pipeline exclusively using synthetic data, with validation performed on multi-institutional, real-world datasets.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>This retrospective human-subject study was approved by the institutional review board of Jichi Medical University Hospital (approval J24-017; approval date: June 10, 2024). The requirement for informed consent was waived by the institutional review board because this study involved a secondary analysis of existing clinical data. All data were anonymized or deidentified prior to analysis in accordance with institutional data governance policies, and no direct personal identifiers were accessible to the research team at any point during the study. No compensation was provided to participants. All figures and images included in this paper were nonidentifiable, and no additional consent for image publication was required.</p></sec><sec id="s2-2"><title>Data Synthesis and Annotation (Training Data)</title><p>We generated synthetic Japanese radiology reports using the OpenAI application programming interface and automatically annotated each sentence into 4 predefined categories (<xref ref-type="fig" rid="figure1">Figure 1</xref>). Initially, we attempted to generate 3200 reports using GPT-4.1. 
To ensure the generation of diverse reports, we explicitly defined various attributes and their candidate values&#x2014;including radiologist experience level, reporting style and structural preferences, patient demographics, anatomical site, reason for examination, image quality and coverage scope, presence of prior examinations, disease category, diagnostic certainty, and disease rarity&#x2014;and randomly combined and selected them for each report (Section A in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Flowchart of synthetic data generation. Prompt 1 was used for report generation (Section A in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), and Prompt 2 (system prompt) was used for report annotation (Section B in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Synthetic radiology reports were generated using the OpenAI API (GPT-4.1), and these were annotated using the OpenAI API (GPT-4.1-mini). Reports were excluded based solely on structural validity (eg, parsing errors) without semantic filtering. The final dataset was partitioned into training (n=2670), validation (n=334), and test (n=100) sets. 
API: application programming interface.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e86365_fig01.png"/></fig><p>Subsequently, these reports were automatically annotated using GPT-4.1-mini, where sentences were assigned to one of the following 4 categories: label 0 (background) for nonfinding information within the findings section, including clinical history, prior study comparisons, technical assessments, and demographics; label 1 (positive finding) to denote the presence of abnormalities; label 2 (negative finding) to denote the absence of abnormalities, with label 1 taking precedence if both coexist in a single sentence; and label 3 (continuation) for sentences that provide supplementary details to the preceding sentence without constituting a standalone finding. Representative examples for each label are provided in <xref ref-type="table" rid="table1">Table 1</xref>, and the system prompt used for automated annotation is detailed in Section B in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>After excluding reports with unintended formats, including parsing errors (failure to produce a valid JSON structure), the final dataset comprised 2670 training reports, 334 validation reports, and 100 test reports. 
For the synthetic test set, sentence-level annotation into the 4 predefined categories was manually performed by 2 board-certified radiologists, and a consensus was reached for each sentence.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Definitions of sentence labels and examples.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Label</td><td align="left" valign="bottom">Definitions</td><td align="left" valign="bottom">Examples</td></tr></thead><tbody><tr><td align="left" valign="top">0=Background</td><td align="left" valign="top">Background information not related to current imaging findings</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>&#x201C;Patient has a history of right lung cancer surgery.&#x201D;</p></list-item><list-item><p>&#x201C;Comparison with previous CT<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> from October 2024.&#x201D;</p></list-item></list></td></tr><tr><td align="left" valign="top">1=Positive finding</td><td align="left" valign="top">Description of a distinct positive finding</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>&#x201C;A 2-cm hypervascular lesion is observed in Segment 8 of the liver.&#x201D;</p></list-item><list-item><p>&#x201C;Mild pleural effusion is present on the right side.&#x201D;</p></list-item></list></td></tr><tr><td align="left" valign="top">2=Negative finding</td><td align="left" valign="top">Description of a distinct negative finding (absence of abnormalities)</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>&#x201C;No significant lymphadenopathy is noted.&#x201D;</p></list-item><list-item><p>&#x201C;There are no signs of intracranial hemorrhage.&#x201D;</p></list-item></list></td></tr><tr><td align="left" valign="top">3=Continuation</td><td align="left" valign="top">Sentences that modify or describe the preceding finding and do not stand 
alone.</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>&#x201C;It shows gradual washout on the delayed phase.&#x201D;</p></list-item><list-item><p>&#x201C;This lesion appears stable compared to the previous exam.&#x201D;</p></list-item></list></td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>CT: computed tomography.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-3"><title>External Validation Dataset</title><p>To evaluate whether models trained solely on synthetic data generalize to real-world clinical practice, we constructed a multi-institutional external validation dataset using reports from the Japan Medical Image Database (J-MID) [<xref ref-type="bibr" rid="ref24">24</xref>], a nationwide repository of real-world radiological images and reports collected from multiple Japanese institutions. Instead of selecting curated or disease-specific cases, we adopted a snapshot sampling strategy to reflect routine clinical practice. Specifically, we randomly sampled 40 computed tomography reports from each of the 7 participating institutions within the J-MID, resulting in a total of 280 reports dated October 1, 2024. The dataset demonstrated varying imaging coverage&#x2014;torso (n=141, 50.4%), chest (n=64, 22.9%), abdomen/pelvis (n=20, 7.1%), head (n=19, 6.8%), whole body (n=12, 4.3%), neck (n=8, 2.9%), and others (n=16, 5.7%)&#x2014;including both noncontrast (n=176, 62.9%) and contrast-enhanced (n=104, 37.1%) studies. For text preprocessing, reports were segmented into sentences using line breaks and periods (&#x201C;.&#x201D;). Sentence-level annotation into the 4 predefined categories was manually performed by 2 board-certified radiologists, and a consensus was reached for each sentence.</p></sec><sec id="s2-4"><title>Model Selection</title><p>We used both LLMs and Japanese text classification models to construct and compare sentence classification models. 
To ensure that the training could be handled on a single NVIDIA RTX6000 Ada graphics processing unit (GPU) (48 GB memory), we limited the model size to approximately 4 billion parameters (4B). For LLMs, we fine-tuned Qwen3 (0.6B, 1.7B, 4B) and Llama 3.2 (1B, 3B) using low-rank adaptation (LoRA) [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. For text classification, we selected 3 Japanese models based on the BERT (Bidirectional Encoder Representations from Transformers), RoBERTa (Robustly Optimized BERT Pretraining Approach), and ModernBERT architectures [<xref ref-type="bibr" rid="ref27">27</xref>-<xref ref-type="bibr" rid="ref29">29</xref>]. First, BERT base Japanese v3 was included as a widely used model in Japanese general and medical NLP tasks, including radiology-focused applications [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. Second, JMedRoBERTa [Japanese Medical RoBERTa]-base was utilized because this model is pretrained on large-scale Japanese medical literature [<xref ref-type="bibr" rid="ref32">32</xref>]. Third, ModernBERT-Ja-130M was selected as a recently proposed architecture that achieves high performance while maintaining strong computational efficiency [<xref ref-type="bibr" rid="ref33">33</xref>]. Links to the model cards for all models used in this study are provided in Section C in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-5"><title>Model Training and Inference</title><p>To fine-tune the LLM, we adopted the LoRA framework [<xref ref-type="bibr" rid="ref34">34</xref>]. The configuration was as follows: rank (r)=16, LoRA <italic>&#x03B1;</italic>=32, dropout=0.05, and target modules={q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj}. Training was performed for 3 epochs with a batch size of 2 and gradient accumulation of 8 (effective batch size=16). 
The learning rate was set to 1&#x00D7;10&#x207B;&#x2074; with a cosine scheduler, including 100 warm-up steps. The maximum sequence length was 2048 tokens. We applied 4-bit quantization using quantized LoRA (QLoRA) with 16-bit mixed-precision for memory efficiency [<xref ref-type="bibr" rid="ref35">35</xref>]. In this setting, the entire text was provided as input, and the model was trained for a text-generation task to produce structured outputs in JSON format, utilizing the system prompt detailed in Section B in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The outputs were subsequently parsed into sentence-level predictions.</p><p>For the text classification models, fine-tuning was performed using the HuggingFace Trainer with a maximum sequence length of 512 tokens, a batch size of 16, a learning rate of 2&#x00D7;10&#x207B;&#x2075;, and 3 training epochs. Mixed precision training with 16-bit mixed precision was applied, and the best checkpoint was selected according to the macro-averaged <italic>F</italic><sub>1</sub> (macro <italic>F</italic><sub>1</sub>) score of the validation set. To incorporate contextual information, we used a sentence-centered local context window consisting of the target sentence and its 2 preceding and 2 following sentences. This approach ensures the inclusion of sufficient local context while remaining within the 512-token limit typical of BERT-based models. During inference, models were applied to individual reports in a sentence-wise manner using the same context window configuration as in training.</p></sec><sec id="s2-6"><title>Evaluation Metrics</title><p>To provide a rigorous assessment of both the automated annotations and the fine-tuned models, we defined the following standard classification metrics. 
Classification accuracy was calculated as the ratio of correctly predicted labels to the total number of sentences.</p><disp-formula id="equWL1"><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi mathvariant="normal">A</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">u</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">y</mml:mi></mml:mrow><mml:mtext>&#x00A0;</mml:mtext><mml:mo>=</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mfrac><mml:mrow><mml:mrow><mml:mi mathvariant="normal">N</mml:mi><mml:mi mathvariant="normal">u</mml:mi><mml:mi mathvariant="normal">m</mml:mi><mml:mi mathvariant="normal">b</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">r</mml:mi></mml:mrow><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">f</mml:mi></mml:mrow><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">t</mml:mi></mml:mrow><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mi mathvariant="normal">p</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">d</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">n</mml:mi><mml:mi mathvariant="normal">s</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi 
mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi></mml:mrow><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mi mathvariant="normal">n</mml:mi><mml:mi mathvariant="normal">u</mml:mi><mml:mi mathvariant="normal">m</mml:mi><mml:mi mathvariant="normal">b</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">r</mml:mi></mml:mrow><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">f</mml:mi></mml:mrow><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">n</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">n</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">s</mml:mi></mml:mrow></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>To ensure a balanced evaluation under class imbalance, we additionally adopted the macro <italic>F</italic><sub>1</sub>-score. 
For each class <inline-formula><mml:math id="ieqn1"><mml:mi>i</mml:mi><mml:mi> </mml:mi><mml:mo>(</mml:mo><mml:mi> </mml:mi><mml:mi>i</mml:mi><mml:mi> </mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi> </mml:mi><mml:mo>{</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mi> </mml:mi><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mi> </mml:mi><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mi> </mml:mi><mml:mn>3</mml:mn><mml:mo>}</mml:mo><mml:mo>)</mml:mo></mml:math></inline-formula> (0=background, 1=positive finding, 2=negative finding, and 3=continuation), precision <inline-formula><mml:math id="ieqn2"><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo><mml:mi> </mml:mi></mml:math></inline-formula>, recall <inline-formula><mml:math id="ieqn3"><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:math></inline-formula>, and <italic>F</italic><sub>1</sub>-score <inline-formula><mml:math id="ieqn4"><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mi>F</mml:mi><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:math></inline-formula> were defined as follows:</p><disp-formula id="equWL2"><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" 
scriptlevel="0"><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mtext>&#x00A0;</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mtext>&#x00A0;</mml:mtext><mml:mo>+</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mi>F</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac><mml:mtext>&#x00A0;</mml:mtext><mml:mo>,</mml:mo><mml:msub><mml:mi>R</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mtext>&#x00A0;</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mtext>&#x00A0;</mml:mtext><mml:mo>+</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mi>F</mml:mi><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac><mml:mtext>&#x00A0;</mml:mtext><mml:mo>,</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mi>F</mml:mi><mml:msub><mml:mn>1</mml:mn><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mtext>&#x00A0;</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mtext>&#x00A0;</mml:mtext><mml:mo>&#x00D7;</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mtext>&#x00A0;</mml:mtext><mml:mo>&#x00D7;</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:msub><mml:mi>R</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mtext>&#x00A0;</mml:mtext><mml:mo>+</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:msub><mml:mi>R</mml:mi><mml:mrow><mml
:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>The macro <italic>F</italic><sub>1</sub>-score was calculated as the arithmetic mean of these class-specific scores:</p><disp-formula id="equWL3"><mml:math id="eqn3"><mml:mi>M</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>r</mml:mi><mml:mi>o</mml:mi><mml:mi> </mml:mi><mml:mi>F</mml:mi><mml:mn>1</mml:mn><mml:mi> </mml:mi><mml:mo>=</mml:mo><mml:mi> </mml:mi><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:mfrac><mml:mrow><mml:msubsup><mml:mo stretchy="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msubsup><mml:mrow><mml:msub><mml:mrow><mml:mi>F</mml:mi><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:math></disp-formula><p>In addition, we reported the positive predictive value for label 1 (PPV_1), corresponding to <inline-formula><mml:math id="ieqn5"><mml:msub><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> in this framework. This metric was included because label 1 represents positive imaging findings, and high precision for this class is essential for ensuring the reliability of extracted image-text pairs. 
We also reported the recall for label 1 (Recall_1) and class-specific <italic>F</italic><sub>1</sub>-score for label 1 (F1_1) to evaluate dataset coverage and the overall precision-recall balance, corresponding to <inline-formula><mml:math id="ieqn6"><mml:msub><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula><mml:math id="ieqn7"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mi>F</mml:mi><mml:mn>1</mml:mn><mml:mi mathvariant="normal">_</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula>, respectively, in the above framework.</p><p>First, to validate the quality of the automated sentence-level annotations generated by GPT-4.1-mini, we compared them against the radiologist consensus on the synthetic test set. Agreement was quantified using classification accuracy, macro <italic>F</italic><sub>1</sub>-score, and Cohen &#x03BA; coefficient. Second, for the LLM outputs, we quantified the incidence of parsing errors and aggregated these error rates by facility. Third, the performance of all models was evaluated on both the synthetic test set and external validation datasets using accuracy, macro <italic>F</italic><sub>1</sub>-score, PPV_1, Recall_1, and F1_1. For LLM-based approaches, sentences associated with parsing errors were excluded from subsequent performance analyses. Fourth, we performed a targeted error analysis. We reviewed 40 reports from the institution with the highest parsing error rate to identify recurring patterns. Additionally, we examined the best-performing model&#x2019;s errors by extracting false-positive label 1 predictions to identify the primary reasons for these misclassifications.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p><xref ref-type="table" rid="table2">Table 2</xref> summarizes the distribution of labels in the datasets used in this study. 
In the synthetic dataset, label 2 was the most frequent category at 44%, whereas label 1 predominated in the external validation dataset at 42.6%. Synthetic reports exhibited higher verbosity with a mean character count of 359.1 (SD 138.9) compared to 224.4 (SD 125.9) in the external dataset, despite having similar mean sentence counts (11.2, SD 4.0 vs 12.4, SD 6.3). Substantial variability was observed across external institutions.</p><p><xref ref-type="fig" rid="figure2">Figure 2</xref> presents the confusion matrix for the annotation performance of GPT-4.1-mini compared with the radiologist consensus on the synthetic test set. The classification accuracy, macro <italic>F</italic><sub>1</sub>-score, and Cohen &#x03BA; were 0.942, 0.929, and 0.917, respectively.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Distribution of the dataset.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dataset</td><td align="left" valign="bottom">Label 0<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>, n (%)</td><td align="left" valign="bottom">Label 1<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>, n (%)</td><td align="left" valign="bottom">Label 2<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>, n (%)</td><td align="left" valign="bottom">Label 3<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>, n (%)</td><td align="left" valign="bottom">Characters/report, mean (SD)</td><td align="left" valign="bottom">Sentences/report, mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="7">Synthetic dataset (3104 reports)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Total</td><td align="left" valign="top">6593 (18.9)</td><td align="left" valign="top">7673 (22.0)</td><td align="left" valign="top">15,327 (44.0)</td><td align="left" valign="top">5257 
(15.1)</td><td align="left" valign="top">359.1 (138.9)</td><td align="left" valign="top">11.2 (4.0)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Train</td><td align="left" valign="top">5680 (18.9)</td><td align="left" valign="top">6620 (22.1)</td><td align="left" valign="top">13,187 (44.0)</td><td align="left" valign="top">4503 (15.0)</td><td align="left" valign="top">359.7 (138.5)</td><td align="left" valign="top">11.2 (4.0)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Validation</td><td align="left" valign="top">699 (18.7)</td><td align="left" valign="top">828 (22.2)</td><td align="left" valign="top">1629 (43.6)</td><td align="left" valign="top">580 (15.5)</td><td align="left" valign="top">360.1 (141.6)</td><td align="left" valign="top">11.2 (3.9)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Test</td><td align="left" valign="top">214 (19.0)</td><td align="left" valign="top">225 (20.0)</td><td align="left" valign="top">511 (45.5)</td><td align="left" valign="top">174 (15.5)</td><td align="left" valign="top">338.6 (139.2)</td><td align="left" valign="top">11.2 (4.3)</td></tr><tr><td align="left" valign="top" colspan="7">External validation dataset (280 reports)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Total</td><td align="left" valign="top">531 (15.3)</td><td align="left" valign="top">1480 (42.6)</td><td align="left" valign="top">1001 (28.8)</td><td align="left" valign="top">465 (13.4)</td><td align="left" valign="top">224.4 (125.9)</td><td align="left" valign="top">12.4 (6.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Institution A</td><td 
align="left" valign="top">64 (11.6)</td><td align="left" valign="top">233 (42.2)</td><td align="left" valign="top">196 (35.5)</td><td align="left" valign="top">59 (10.7)</td><td align="left" valign="top">304.7 (127.4)</td><td align="left" valign="top">13.8 (6.1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Institution B</td><td align="left" valign="top">68 (13.9)</td><td align="left" valign="top">200 (40.9)</td><td align="left" valign="top">182 (37.2)</td><td align="left" valign="top">39 (8.0)</td><td align="left" valign="top">202.5 (73.2)</td><td align="left" valign="top">12.2 (4.4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Institution C</td><td align="left" valign="top">58 (20.4)</td><td align="left" valign="top">122 (42.8)</td><td align="left" valign="top">58 (20.4)</td><td align="left" valign="top">47 (16.5)</td><td align="left" valign="top">148.3 (87.1)</td><td align="left" valign="top">7.1 (3.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Institution D</td><td align="left" valign="top">77 (18.3)</td><td align="left" valign="top">150 (35.6)</td><td align="left" valign="top">132 (31.4)</td><td align="left" valign="top">62 (14.7)</td><td align="left" valign="top">135.2 (70.0)</td><td align="left" valign="top">10.5 (5.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Institution E</td><td align="left" valign="top">75 (15.5)</td><td align="left" valign="top">201 (41.5)</td><td align="left" valign="top">166 (34.3)</td><td align="left" valign="top">42 (8.7)</td><td align="left" valign="top">186.6 (81.8)</td><td align="left" valign="top">12.1 (4.9)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Institution F</td><td align="left" valign="top">89 (12.4)</td><td align="left" valign="top">337 (47.1)</td><td align="left" valign="top">181 (25.3)</td><td align="left" valign="top">108 (15.1)</td><td align="left" valign="top">339.3 (138.3)</td><td align="left" valign="top">17.9 (7.6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Institution G</td><td align="left" valign="top">100 (18.8)</td><td align="left" valign="top">237 (44.6)</td><td align="left" valign="top">86 (16.2)</td><td align="left" valign="top">108 (20.3)</td><td align="left" valign="top">254.0 (128.2)</td><td align="left" valign="top">13.3 (6.3)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Labels represent the following sentence categories: 0=background, 1=positive findings, 2=negative findings, and 3=continuation. Values in parentheses represent percentages of the row total.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Confusion matrix of GPT-4.1-mini versus radiologist consensus on the synthetic test set. Labels represent the following categories: 0=background, 1=positive findings, 2=negative findings, and 3=continuation. Per-class precision/recall: label 0=0.946/0.977; label 1=0.931/0.956; label 2=0.982/0.953; and label 3=0.841/0.851.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e86365_fig02.png"/></fig><p><xref ref-type="table" rid="table3">Table 3</xref> presents the performance of the models on the internal test set, which consisted of 1124 sentences. Parsing errors were observed only in Qwen3 (0.6B) (3/1124 sentences, 0.27%), and no errors were observed in the other models. 
Accuracy ranged from 0.938 to 0.951; macro <italic>F</italic><sub>1</sub>-scores ranged from 0.924 to 0.940; PPV_1 ranged from 0.927 to 0.957; Recall_1 ranged from 0.898 to 0.960; and F1_1 ranged from 0.921 to 0.951.</p><p><xref ref-type="table" rid="table4">Table 4</xref> shows the results on the external validation dataset, which comprised 3477 sentences from 280 reports. Parsing errors were observed only in the LLM group, ranging from 19 to 260 sentences (0.55%&#x2010;7.48%), whereas no errors were observed in the text classification models. Accuracy ranged from 0.783 to 0.813; macro <italic>F</italic><sub>1</sub>-score ranged from 0.761 to 0.790; PPV_1 ranged from 0.897 to 0.952; Recall_1 ranged from 0.699 to 0.733; and F1_1 ranged from 0.794 to 0.821. Among all the models, Qwen3 (4B) demonstrated the best overall performance, maintaining high accuracy, Macro <italic>F</italic><sub>1</sub>-score, and PPV_1.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Results on the internal synthetic test set<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup>.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Parsing errors<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup><break/>(n, % of sentences)</td><td align="left" valign="bottom">Accuracy (95% CI)</td><td align="left" valign="bottom">Macro <italic>F</italic><sub>1</sub><sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup> (95% CI)</td><td align="left" valign="bottom">PPV<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup> for label 1 (95% CI)</td><td align="left" valign="bottom">Recall for label 1 (95% CI)</td><td align="left" valign="bottom">F1 for label 1 (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="7">LLMs<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup> (name, parameters)</td></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Qwen3 (0.6B)</td><td align="left" valign="top">3 (0.27)</td><td align="left" valign="top">0.942 (0.930&#x2010;0.955)</td><td align="left" valign="top">0.929 (0.913&#x2010;0.944)</td><td align="left" valign="top">0.954 (0.926&#x2010;0.978)</td><td align="left" valign="top">0.906 (0.870&#x2010;0.943)</td><td align="left" valign="top">0.929 (0.906&#x2010;0.951)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Qwen3 (1.7B)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0.951 (0.937&#x2010;0.963)</td><td align="left" valign="top">0.940 (0.923&#x2010;0.955)</td><td align="left" valign="top">0.941 (0.910&#x2010;0.970)</td><td align="left" valign="top">0.915 (0.873&#x2010;0.947)</td><td align="left" valign="top">0.928 (0.904&#x2010;0.948)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Qwen3 (4B)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0.950 (0.937&#x2010;0.962)</td><td align="left" valign="top">0.939 (0.922&#x2010;0.954)</td><td align="left" valign="top">0.957 (0.930&#x2010;0.981)</td><td align="left" valign="top">0.898 (0.860&#x2010;0.941)</td><td align="left" valign="top">0.927 (0.903&#x2010;0.950)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama 3.2 (1B)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0.949 (0.938&#x2010;0.961)</td><td align="left" valign="top">0.940 (0.926&#x2010;0.955)</td><td align="left" valign="top">0.949 (0.923&#x2010;0.976)</td><td align="left" valign="top">0.911 (0.871&#x2010;0.949)</td><td align="left" valign="top">0.930 (0.908&#x2010;0.952)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama 3.2 (3B)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0.942 (0.928&#x2010;0.956)</td><td align="left" valign="top">0.927 (0.908&#x2010;0.944)</td><td align="left" valign="top">0.927 (0.889&#x2010;0.960)</td><td align="left" valign="top">0.915 (0.884&#x2010;0.948)</td><td align="left" valign="top">0.921 (0.897&#x2010;0.945)</td></tr><tr><td align="left" valign="top" colspan="7">Text classification models</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BERT<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup> base Japanese v3</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0.944 (0.931&#x2010;0.956)</td><td align="left" valign="top">0.931 (0.914&#x2010;0.946)</td><td align="left" valign="top">0.939 (0.907&#x2010;0.969)</td><td align="left" valign="top">0.960 (0.931&#x2010;0.986)</td><td align="left" valign="top">0.949 (0.930&#x2010;0.968)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>JMedRoBERTa<sup><xref ref-type="table-fn" rid="table3fn7">g</xref></sup>-base</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0.938 (0.924&#x2010;0.952)</td><td align="left" valign="top">0.924 (0.905&#x2010;0.940)</td><td align="left" valign="top">0.941 (0.910&#x2010;0.969)</td><td align="left" valign="top">0.925 (0.886&#x2010;0.959)</td><td align="left" valign="top">0.933 (0.909&#x2010;0.956)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ModernBERT-Ja-130M</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0.948 (0.936&#x2010;0.959)</td><td align="left" valign="top">0.933 (0.917&#x2010;0.948)</td><td align="left" valign="top">0.951 (0.925&#x2010;0.973)</td><td 
align="left" valign="top">0.951 (0.921&#x2010;0.977)</td><td align="left" valign="top">0.951 (0.934&#x2010;0.968)</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>95% CIs were estimated using the bootstrap method.</p></fn><fn id="table3fn2"><p><sup>b</sup>Parsing errors: failures in JSON parsing during output generation.</p></fn><fn id="table3fn3"><p><sup>c</sup>Macro <italic>F</italic><sub>1</sub>: macro-averaged <italic>F</italic><sub>1</sub>.</p></fn><fn id="table3fn4"><p><sup>d</sup>PPV: positive predictive value.</p></fn><fn id="table3fn5"><p><sup>e</sup>LLM: large language model.</p></fn><fn id="table3fn6"><p><sup>f</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table3fn7"><p><sup>g</sup>JMedRoBERTa: Japanese Medical Robustly Optimized BERT Pretraining Approach.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Results on the external validation dataset<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup>.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Parsing errors<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup><break/>(n, % of sentences)</td><td align="left" valign="bottom">Accuracy (95% CI)</td><td align="left" valign="bottom">Macro <italic>F</italic><sub>1</sub><sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup> (95% CI)</td><td align="left" valign="bottom">PPV<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup> for label 1 (95% CI)</td><td align="left" valign="bottom">Recall for label 1 (95% CI)</td><td align="left" valign="bottom">F1 for label 1 (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="7">LLMs<sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup> (name, parameters)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Qwen3 (0.6B)</td><td align="left" valign="top">81 (2.33)</td><td align="left" valign="top">0.783 (0.767&#x2010;0.802)</td><td align="left" valign="top">0.761 (0.743&#x2010;0.780)</td><td align="left" valign="top">0.919 (0.899&#x2010;0.937)</td><td align="left" valign="top">0.699 (0.671&#x2010;0.725)</td><td align="left" valign="top">0.794 (0.774&#x2010;0.813)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Qwen3 (1.7B)</td><td align="left" valign="top">121 (3.48)</td><td align="left" valign="top">0.797 (0.780&#x2010;0.815)</td><td align="left" valign="top">0.776 (0.759&#x2010;0.794)</td><td align="left" valign="top">0.932 (0.916&#x2010;0.948)</td><td align="left" valign="top">0.701 (0.673&#x2010;0.729)</td><td align="left" valign="top">0.800 (0.779&#x2010;0.822)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Qwen3 (4B)</td><td align="left" valign="top">19 (0.55)</td><td align="left" valign="top">0.813 (0.796&#x2010;0.829)</td><td align="left" valign="top">0.790 (0.771&#x2010;0.808)</td><td align="left" valign="top">0.952 (0.939&#x2010;0.964)</td><td align="left" valign="top">0.721 (0.695&#x2010;0.746)</td><td align="left" valign="top">0.821 (0.802&#x2010;0.839)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama 3.2 (1B)</td><td align="left" valign="top">260 (7.48)</td><td align="left" valign="top">0.797 (0.779&#x2010;0.813)</td><td align="left" valign="top">0.773 (0.754&#x2010;0.790)</td><td align="left" valign="top">0.916 (0.897&#x2010;0.933)</td><td align="left" valign="top">0.724 (0.697&#x2010;0.751)</td><td align="left" valign="top">0.808 (0.789&#x2010;0.829)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama 3.2 (3B)</td><td align="left" valign="top">70 (2.01)</td><td align="left" valign="top">0.806 (0.788&#x2010;0.822)</td><td align="left" valign="top">0.782 (0.763&#x2010;0.800)</td><td align="left" valign="top">0.919 (0.901&#x2010;0.935)</td><td align="left" valign="top">0.733 (0.707&#x2010;0.759)</td><td align="left" valign="top">0.816 (0.797&#x2010;0.834)</td></tr><tr><td align="left" valign="top" colspan="7">Text classification models</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BERT<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup> base Japanese v3</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0.788 (0.771&#x2010;0.804)</td><td align="left" valign="top">0.770 (0.752&#x2010;0.786)</td><td align="left" valign="top">0.897 (0.877&#x2010;0.916)</td><td align="left" valign="top">0.716 (0.688&#x2010;0.743)</td><td align="left" valign="top">0.796 (0.774&#x2010;0.816)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>JMedRoBERTa<sup><xref ref-type="table-fn" rid="table4fn7">g</xref></sup>-base</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0.799 (0.781&#x2010;0.815)</td><td align="left" valign="top">0.783 (0.765&#x2010;0.799)</td><td align="left" valign="top">0.932 (0.915&#x2010;0.948)</td><td align="left" valign="top">0.709 (0.680&#x2010;0.737)</td><td align="left" valign="top">0.806 (0.786&#x2010;0.826)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ModernBERT-Ja-130M</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0.801 (0.784&#x2010;0.818)</td><td align="left" valign="top">0.782 (0.764&#x2010;0.799)</td><td align="left" valign="top">0.917 
(0.898&#x2010;0.934)</td><td align="left" valign="top">0.722 (0.697&#x2010;0.748)</td><td align="left" valign="top">0.808 (0.790&#x2010;0.827)</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>95% CIs were estimated using the bootstrap method.</p></fn><fn id="table4fn2"><p><sup>b</sup>Parsing errors: failures in JSON parsing during output generation.</p></fn><fn id="table4fn3"><p><sup>c</sup>Macro <italic>F</italic><sub>1</sub>: macro-averaged <italic>F</italic><sub>1</sub>.</p></fn><fn id="table4fn4"><p><sup>d</sup>PPV: positive predictive value.</p></fn><fn id="table4fn5"><p><sup>e</sup>LLM: large language model.</p></fn><fn id="table4fn6"><p><sup>f</sup>BERT: bidirectional encoder representations from transformers.</p></fn><fn id="table4fn7"><p><sup>g</sup>JMedRoBERTa: Japanese Medical Robustly Optimized BERT Pretraining Approach.</p></fn></table-wrap-foot></table-wrap><p><xref ref-type="table" rid="table5">Table 5</xref> summarizes the accuracy and sentence-level parsing errors for each institution. In institution C, which exhibited a notably high incidence of parsing errors, a detailed review of 40 reports revealed that all reports with failed parsing contained specific notations such as &#x201C;[#1]&#x201D; or &#x201C;[#1&#x2010;2]&#x201D; within the text. These markers appeared to be used for referencing attached images, potentially reflecting a reporting style specific to that institution or individual radiologists.</p><p><xref ref-type="fig" rid="figure3">Figure 3</xref> shows the confusion matrices of Qwen3 (4B), the top-performing model, for the synthetic test set and external validation datasets.</p><p>An error analysis was conducted on the predictions of Qwen3 (4B) to identify factors contributing to the degradation of PPV_1. 
Among sentences incorrectly predicted as label 1, 3 recurring patterns were observed: (1) for label 0, 80% (8/10) of errors involved mistaking patient status or treatment history for active findings; (2) for label 2, 44% (4/9) of errors occurred in sentences mentioning a lesion name with weak negation or stability descriptors (eg, &#x201C;unclear&#x201D; or &#x201C;maintained reduction&#x201D;); and (3) for label 3, 41% (14/34) of misclassifications involved sentences describing differential diagnoses related to findings in the preceding text.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Accuracy and parsing errors by institution<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup>.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model/institution</td><td align="left" valign="bottom">A</td><td align="left" valign="bottom">B</td><td align="left" valign="bottom">C</td><td align="left" valign="bottom">D</td><td align="left" valign="bottom">E</td><td align="left" valign="bottom">F</td><td align="left" valign="bottom">G</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="8">LLMs<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup> (name, parameters)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Qwen3 (0.6B)</td><td align="left" valign="top">0.835 (1)</td><td align="left" valign="top">0.807 (3)</td><td align="left" valign="top">0.737 (2)</td><td align="left" valign="top">0.810 (1)</td><td align="left" valign="top">0.766 (4)</td><td align="left" valign="top">0.806 (9)</td><td align="left" valign="top">0.696</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Qwen3 (1.7B)</td><td align="left" valign="top">0.842 (2)</td><td align="left" valign="top">0.797 (1)</td><td align="left" valign="top">0.746 
(8)</td><td align="left" valign="top">0.858 (2)</td><td align="left" valign="top">0.786</td><td align="left" valign="top">0.814 (5)</td><td align="left" valign="top">0.724</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Qwen3 (4B)</td><td align="left" valign="top">0.855</td><td align="left" valign="top"><italic>0.816</italic></td><td align="left" valign="top"><italic>0.754</italic> (1)</td><td align="left" valign="top"><italic>0.884</italic> (1)</td><td align="left" valign="top"><italic>0.805</italic></td><td align="left" valign="top">0.817 (1)</td><td align="left" valign="top">0.742</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama 3.2 (1B)</td><td align="left" valign="top">0.853 (3)</td><td align="left" valign="top">0.810 (5)</td><td align="left" valign="top">0.722 (11)</td><td align="left" valign="top">0.835 (5)</td><td align="left" valign="top">0.805 (4)</td><td align="left" valign="top">0.806 (8)</td><td align="left" valign="top">0.719</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama 3.2 (3B)</td><td align="left" valign="top"><italic>0.857</italic></td><td align="left" valign="top">0.808 (6)</td><td align="left" valign="top">0.720 (6)</td><td align="left" valign="top">0.867 (1)</td><td align="left" valign="top">0.786 (1)</td><td align="left" valign="top"><italic>0.826</italic> (4)</td><td align="left" valign="top">0.734</td></tr><tr><td align="left" valign="top" colspan="8">Text classification models</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BERT<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup> base Japanese v3</td><td align="left" valign="top">0.844</td><td align="left" valign="top">0.81</td><td align="left" 
valign="top">0.712</td><td align="left" valign="top">0.77</td><td align="left" valign="top">0.783</td><td align="left" valign="top">0.813</td><td align="left" valign="top">0.74</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>JMedRoBERTa<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup>-base</td><td align="left" valign="top">0.844</td><td align="left" valign="top">0.814</td><td align="left" valign="top">0.74</td><td align="left" valign="top">0.796</td><td align="left" valign="top">0.793</td><td align="left" valign="top">0.815</td><td align="left" valign="top"><italic>0.755</italic></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ModernBERT-Ja-130M</td><td align="left" valign="top">0.841</td><td align="left" valign="top">0.802</td><td align="left" valign="top">0.709</td><td align="left" valign="top">0.834</td><td align="left" valign="top">0.8</td><td align="left" valign="top">0.822</td><td align="left" valign="top">0.753</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>Numbers indicate accuracy; numbers in parentheses indicate the number of report-level parsing errors. Italicized values indicate the highest accuracy for each institution.</p></fn><fn id="table5fn2"><p><sup>b</sup>LLM: large language model.</p></fn><fn id="table5fn3"><p><sup>c</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table5fn4"><p><sup>d</sup>JMedRoBERTa: Japanese Medical Robustly Optimized BERT Pretraining Approach.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Performance of Qwen3 (4B) on the (A) internal synthetic test set and (B) external validation datasets. Labels represent the following categories: 0=background, 1=positive findings, 2=negative findings, and 3=continuation. 
Per-class precision/recall on the synthetic test set: Label 0 = 0.967/0.958; Label 1 = 0.957/0.898; Label 2 = 0.967/0.984; Label 3 = 0.873/0.908. On the external validation dataset: Label 0 = 0.739/0.947; Label 1 = 0.952/0.721; Label 2 = 0.847/0.933; and Label 3 = 0.561/0.692.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e86365_fig03.png"/></fig></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>In this study, we developed context-aware sentence classification models for Japanese radiology reports using synthetic data and evaluated the models on multi-institutional radiology reports. All models demonstrated high performance on the internal synthetic test set (accuracy: 0.938&#x2010;0.951; macro <italic>F</italic><sub>1</sub>-score: 0.924&#x2010;0.940). Although performance declined on the external validation dataset (accuracy: 0.783&#x2010;0.813; macro <italic>F</italic><sub>1</sub>-score: 0.761&#x2010;0.790), PPV_1 remained stable and comparable to the internal results (eg, 0.957 vs 0.952 for Qwen3 [4B]). This discrepancy suggests that while the distributional gap between synthetic and real-world reports posed challenges for overall classification, the models effectively captured the core linguistic patterns and medical terminology associated with positive findings. From a practical standpoint, the stability of PPV_1 is particularly important, as it indicates that models trained solely on synthetic data can reliably identify findings from diverse clinical reports without a high rate of false positives. This robustness supports the quality of the resulting image-text pairs, which is a prerequisite for the efficient large-scale development of VLMs. 
Although PPV_1 remained stable across datasets, Recall_1 declined from 0.898&#x2010;0.960 on the synthetic test set to 0.699&#x2010;0.733 on the external validation dataset, indicating that approximately 27% to 30% of true positive findings were not captured. In the context of VLM dataset construction, this precision-recall balance has practical implications. False-positive image-text pairs, where a nonfinding sentence is incorrectly paired with an image, can actively introduce noise into model training. In contrast, false negatives reduce dataset coverage but do not compromise the quality of the retained pairs. Therefore, the observed high-precision, moderate-recall profile is considered to be acceptable for the initial construction of reliable image-text datasets, while improving recall through prompt refinement and domain adaptation remains an important direction for future work.</p></sec><sec id="s4-2"><title>Prior Work and the Sentence-Level Approach</title><p>Previous studies in radiology report structuring have predominantly utilized entity-level extraction (eg, extracting specific disease names) or graph-based representations involving nodes and edges [<xref ref-type="bibr" rid="ref36">36</xref>-<xref ref-type="bibr" rid="ref38">38</xref>]. These paradigms offer distinct advantages: entity-level extraction allows for the highly granular identification of specific clinical findings and facilitates the precise handling of negation and uncertainty expressions, while graph-based approaches excel at representing complex relationships between anatomical locations and pathologies. More recently, NLP applications have expanded to include the labeling of anatomical phrases, paraphrasing of clinical statements, and full report structuring [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref40">40</xref>].</p><p>In contrast, this study adopted a sentence-level labeling approach. 
Although this method faces challenges with sentences containing multiple distinct findings (eg, &#x201C;A liver cyst is present, but no renal cyst is observed&#x201D;) [<xref ref-type="bibr" rid="ref41">41</xref>], our policy prioritized label 1. In many clinical instances, positive and negative statements coexisting in a single sentence refer to the same lesion (eg, &#x201C;A hypodense lesion is present in the liver, but it shows no corresponding contrast enhancement&#x201D;). Our framework categorizes such descriptions as label 1. If the clauses in this example were split into 2 separate sentences, they would be labeled as label 1 followed by label 3. By utilizing the combination of label 1 and its associated label 3 in downstream tasks, nearly equivalent semantic information can be obtained regardless of whether the description is contained within a single sentence or distributed across multiple sentences. Therefore, we argue that this approach remains robust for practical clinical applications. As a preliminary analysis, we examined the potential impact of multiclause sentences on the classification performance using Qwen3 (4B) predictions on the external validation dataset; the results are detailed in Section F in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. While implementing an LLM-based preprocessing step to segment complex sentences into distinct, meaning-preserving statements could further refine this pipeline, we deferred this in this study to avoid increasing procedural complexity and the difficulty of establishing evaluation metrics.</p><p>More importantly, we believe that preserving sentence-level expressions&#x2014;rather than reducing them to simple entity extraction&#x2014;is highly beneficial for VLM training. This approach retains the nuanced phrasing and the radiologist&#x2019;s diagnostic thought process. 
Preserving these linguistic characteristics provides richer textual information for the model to learn the association between visual findings and professional reporting styles, which is essential for developing clinically relevant models [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref42">42</xref>].</p></sec><sec id="s4-3"><title>Challenges in Clinical Diversity and Prompt Optimization</title><p>The variability in performance across institutions (<xref ref-type="table" rid="table5">Table 5</xref>) provides key insights for implementation. <xref ref-type="table" rid="table2">Table 2</xref> reveals significant diversity in report length and label distribution. Notably, facilities with metrics similar to the synthetic data&#x2014;such as report length and finding proportions&#x2014;did not always yield the highest accuracy. This suggests that clinical diversity involves complex factors beyond aggregate metrics, including disease prevalence, institution-specific rules, and individual radiologist styles. Although our prompts incorporated various elements, they do not yet fully align with real-world practice. Potential refinements include adjusting demographic granularity (eg, using numerical values instead of categorical descriptors), reweighting clinical scenarios (eg, disease frequency, emergency vs inpatient settings, or the prevalence of poor image quality and motion artifacts), and fine-tuning class distributions (eg, the ratio of positive to negative findings). Future research must refine prompt design to address this reporting diversity, which remains the primary challenge in utilizing a purely synthetic training set.</p><p>Error analyses identified several technical and clinical challenges. Parsing errors in institution C likely resulted from image-referencing notations such as &#x201C;[#1],&#x201D; which the LLM may have misinterpreted as structural control characters. 
For Qwen3 (4B), false-positive label 1 predictions revealed specific patterns: (1) 80% (8/10) of label 0 errors involved mistaking patient history for active findings; (2) 44% (4/9) of label 2 errors involved lesion names paired with weak negations or stability descriptors, such as &#x201C;unclear&#x201D; or &#x201C;maintained reduction&#x201D;; and (3) 41% (14/34) of label 3 errors occurred in sentences describing differential diagnoses related to preceding findings. These results emphasize the importance of incorporating complex negations, facility-specific symbols, and sophisticated clinical contexts into future synthetic data prompts [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]. While this study focused on validating general extraction performance, institutional domain adaptation and post hoc prompt adjustments remain subjects for future investigation.</p></sec><sec id="s4-4"><title>Practical Considerations for Clinical Deployment</title><p>Text classification models, including BERT base Japanese v3, JMedRoBERTa-base, and ModernBERT-Ja-130M, showed stable performance compared with that of small-scale LLMs. From a practical standpoint, these classification models offer distinct advantages in reliability and processing speed (detailed in Section D in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Their architecture inherently eliminates the risk of format instabilities while ensuring high throughput even on standard hardware [<xref ref-type="bibr" rid="ref45">45</xref>]. Although small-scale LLMs require more computational resources than BERT-based models, the decision to evaluate them was similarly rooted in the need for on-premise feasibility. These models with under 4 billion parameters can operate on a single mid-range GPU with 12 to 24 GB of Video RAM (eg, NVIDIA RTX 4090 or RTX 4070 Ti Super). 
This allows institutions to maintain strict data privacy and ensure operational stability without relying on external cloud-based services.</p><p>Parsing errors remained a challenge unique to the LLM group [<xref ref-type="bibr" rid="ref46">46</xref>]. While we considered using vLLM, an open-source library optimized for high-throughput LLM serving, to implement JSON mode, we intentionally opted against it. JSON mode is a constrained decoding technique provided by the inference framework rather than a native capability of the LLMs themselves. Our primary objective was to evaluate the models&#x2019; inherent ability to follow structural instructions without external enforcement. Furthermore, we found that such rigid constraints could lead to unintended text alterations or premature output truncation. We also investigated whether a re-inference strategy could resolve these parsing errors, with the results detailed in Section E in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s4-5"><title>Limitations</title><p>This study had some limitations. First, most training labels in the synthetic data were generated automatically without individual human verification. While a quality assessment of the synthetic test set, comprising 1124 sentences from 100 reports, showed high reliability (&#x03BA;=0.917), the remaining unverified training data may contain annotation noise that potentially constrained the models&#x2019; final performance. The iterative refinement of these labels through expert review and subsequent fine-tuning could represent a promising avenue to further enhance model accuracy and robustness.</p><p>Second, although Japanese served as a representative example of a low-resource language in the medical domain, the generalizability of this approach to other underrepresented languages has not yet been verified. 
As noted in the section &#x201C;Challenges in Clinical Diversity and Prompt Optimization,&#x201D; there remains room for improvement in our synthetic data generation process. Applying this methodology to other languages would necessitate additional tuning to align with specific linguistic nuances and localized clinical reporting conventions.</p><p>Third, the comparison between LLMs and text classification models involved differing input contexts. LLMs processed full reports to leverage their expansive context windows and maximize throughput. In contrast, text classification models utilized a 5-sentence sliding window due to inherent input length constraints. Restricting LLMs to identical local windows would have artificially limited their native processing capacity and practical utility. Therefore, these results should be interpreted as a comparison of each model in its optimal configuration rather than under strictly uniform input conditions.</p></sec><sec id="s4-6"><title>Future Directions</title><p>Future work should focus on integrating expert-annotated real-world reports for iterative model fine-tuning and establishing a more robust pipeline. This includes the implementation of advanced preprocessing steps, such as LLM-based sentence segmentation, to accurately handle descriptions containing multiple clinical findings. Furthermore, a primary objective will be to generate high-quality image-text pairs from these structured reports to facilitate the development of medical vision-language foundation models.</p></sec><sec id="s4-7"><title>Conclusions</title><p>This study demonstrated the feasibility of developing context-aware sentence classification models for Japanese radiology reports using a training pipeline based entirely on synthetic data, thereby substantially reducing the need for labor-intensive manual labeling. Across multiple model architectures, high classification performance was achieved on the synthetic test set. 
Although external validation using multi-institutional, real-world reports showed a consistent pattern of performance degradation, the models maintained a stable positive predictive value for positive findings. These results indicate that the models effectively captured the essential clinical terminology and linguistic patterns required to identify positive imaging findings in real-world reports.</p></sec></sec></body><back><ack><p>The authors would like to thank the departments of radiology that provided the Japan Medical Image Database (J-MID), including Juntendo University, Kyushu University, Keio University, The University of Tokyo, Okayama University, Kyoto University, Osaka University, Tokyo Medical and Dental University, Hokkaido University, Ehime University, and Tokushima University. Artificial intelligence&#x2013;assisted technologies (ChatGPT; OpenAI) were used for English language editing. The authors take full responsibility for the content of the article.</p></ack><notes><sec><title>Funding</title><p>This research was supported by Accreditation Organization for Management of Radiologic Imaging (AOMRI) Research Grant 2025 and by Cross-ministerial Strategic Innovation Promotion Program (SIP) on &#x201C;Integrated Health Care System&#x201D; (grant JPJ012425).</p></sec><sec><title>Data Availability</title><p>The synthetic data with the annotation scripts and the three fine-tuned text classification models used in this study are available via a GitHub repository [<xref ref-type="bibr" rid="ref47">47</xref>]. The original clinical data obtained from the Japan Medical Image Database (J-MID) cannot be shared publicly due to ethical and privacy restrictions.</p></sec></notes><fn-group><fn fn-type="con"><p>T Kikuchi conceived and designed the study, performed data curation, formal analysis, and investigation, developed the methodology, administered the project, created the visualizations, and drafted the manuscript. 
YY contributed to conceptualization, formal analysis, investigation, and methodology. KY contributed to conceptualization, data curation, and methodology. TA contributed to data curation and critical revision of the manuscript. H Mori, H Makimoto, and T Kohro critically reviewed and edited the manuscript. All authors reviewed the final version of the manuscript, approved its submission, and agreed to be accountable for all aspects of this work.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">BERT</term><def><p>Bidirectional Encoder Representations from Transformers</p></def></def-item><def-item><term id="abb2">GPU</term><def><p>graphics processing unit</p></def></def-item><def-item><term id="abb3">J-MID</term><def><p>Japan Medical Image Database</p></def></def-item><def-item><term id="abb4">JMedRoBERTa</term><def><p>Japanese Medical Robustly Optimized BERT Pretraining Approach</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb6">LoRA</term><def><p>low-rank adaptation</p></def></def-item><def-item><term id="abb7">Macro <italic>F</italic><sub>1</sub></term><def><p>macro-averaged <italic>F</italic><sub>1</sub></p></def></def-item><def-item><term id="abb8">MIMIC-CXR</term><def><p>Medical Information Mart for Intensive Care Chest X-Ray</p></def></def-item><def-item><term id="abb9">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb10">PPV</term><def><p> positive predictive value</p></def></def-item><def-item><term id="abb11">QLoRA</term><def><p> quantized low-rank adaptation</p></def></def-item><def-item><term id="abb12">RoBERTa</term><def><p>Robustly Optimized BERT Pretraining Approach</p></def></def-item><def-item><term id="abb13">VLM</term><def><p>vision-language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref 
id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>R</given-names> </name><name name-style="western"><surname>Adhikarla</surname><given-names>E</given-names> </name><etal/></person-group><article-title>A generalist vision-language foundation model for diverse biomedical tasks</article-title><source>Nat Med</source><year>2024</year><month>11</month><volume>30</volume><issue>11</issue><fpage>3129</fpage><lpage>3141</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03185-2</pub-id><pub-id pub-id-type="medline">39112796</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Visual-language foundation models in medical imaging: a systematic review and meta-analysis of diagnostic and analytical applications</article-title><source>Comput Methods Programs Biomed</source><year>2025</year><month>08</month><volume>268</volume><fpage>108870</fpage><pub-id pub-id-type="doi">10.1016/j.cmpb.2025.108870</pub-id><pub-id pub-id-type="medline">40424873</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Danish</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sadeghi-Niaraki</surname><given-names>A</given-names> </name><name name-style="western"><surname>Khan</surname><given-names>SU</given-names> </name><name 
name-style="western"><surname>Dang</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Tightiz</surname><given-names>L</given-names> </name><name name-style="western"><surname>Moon</surname><given-names>H</given-names> </name></person-group><article-title>A comprehensive survey of vision&#x2013;language models: pretrained models, fine-tuning, prompt engineering, adapters, and benchmark datasets</article-title><source>Information Fusion</source><year>2026</year><month>02</month><volume>126</volume><fpage>103623</fpage><pub-id pub-id-type="doi">10.1016/j.inffus.2025.103623</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yi</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Xiao</surname><given-names>T</given-names> </name><name name-style="western"><surname>Albert</surname><given-names>MV</given-names> </name></person-group><article-title>A survey on multimodal large language models in radiology for report generation and visual question answering</article-title><source>Information</source><year>2025</year><month>02</month><day>12</day><volume>16</volume><issue>2</issue><fpage>136</fpage><pub-id pub-id-type="doi">10.3390/info16020136</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yamamoto</surname><given-names>K</given-names> </name><name name-style="western"><surname>Kikuchi</surname><given-names>T</given-names> </name></person-group><article-title>TotalFM: an organ-separated framework for 3D-CT vision foundation models</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 1, 2026</comment><pub-id pub-id-type="doi">10.48550/arXiv.2601.00260</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hartsock</surname><given-names>I</given-names> </name><name name-style="western"><surname>Rasool</surname><given-names>G</given-names> </name></person-group><article-title>Vision-language models for medical report generation and visual question answering: a review</article-title><source>Front Artif Intell</source><year>2024</year><volume>7</volume><fpage>1430984</fpage><pub-id pub-id-type="doi">10.3389/frai.2024.1430984</pub-id><pub-id pub-id-type="medline">39628839</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ryu</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Kang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Chu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>S</given-names> </name></person-group><article-title>Vision-language foundation models for medical imaging: a review of current practices and innovations</article-title><source>Biomed Eng Lett</source><year>2025</year><month>09</month><volume>15</volume><issue>5</issue><fpage>809</fpage><lpage>830</lpage><pub-id pub-id-type="doi">10.1007/s13534-025-00484-6</pub-id><pub-id pub-id-type="medline">40917147</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moor</surname><given-names>M</given-names> </name><name name-style="western"><surname>Banerjee</surname><given-names>O</given-names> </name><name name-style="western"><surname>Abad</surname><given-names>ZSH</given-names> </name><etal/></person-group><article-title>Foundation models for generalist medical artificial intelligence</article-title><source>Nature New 
Biol</source><year>2023</year><month>04</month><day>13</day><volume>616</volume><issue>7956</issue><fpage>259</fpage><lpage>265</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-05881-4</pub-id><pub-id pub-id-type="medline">37045921</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>AEW</given-names> </name><name name-style="western"><surname>Pollard</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Berkowitz</surname><given-names>SJ</given-names> </name><etal/></person-group><article-title>MIMIC-CXR, a de-identified publicly available database of chest radiographs with free-text reports</article-title><source>Sci Data</source><year>2019</year><month>12</month><day>12</day><volume>6</volume><issue>1</issue><fpage>317</fpage><pub-id pub-id-type="doi">10.1038/s41597-019-0322-0</pub-id><pub-id pub-id-type="medline">31831740</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hamamci</surname><given-names>IE</given-names> </name><name name-style="western"><surname>Er</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Generalist foundation models from a multimodal dataset for 3D computed tomography</article-title><source>Nat Biomed Eng</source><year>2026</year><month>02</month><day>12</day><pub-id pub-id-type="doi">10.1038/s41551-025-01599-y</pub-id><pub-id pub-id-type="medline">41680439</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yamagishi</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Nakamura</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kikuchi</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Development of a large-scale dataset of chest computed tomography reports in Japanese and a high-performance finding classification model: dataset development and validation study</article-title><source>JMIR Med Inform</source><year>2025</year><month>08</month><day>28</day><volume>13</volume><fpage>e71137</fpage><pub-id pub-id-type="doi">10.2196/71137</pub-id><pub-id pub-id-type="medline">40874833</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sourlos</surname><given-names>N</given-names> </name><name name-style="western"><surname>Vliegenthart</surname><given-names>R</given-names> </name><name name-style="western"><surname>Santinha</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Recommendations for the creation of benchmark datasets for reproducible artificial intelligence in radiology</article-title><source>Insights Imaging</source><year>2024</year><month>10</month><day>14</day><volume>15</volume><issue>1</issue><fpage>248</fpage><pub-id pub-id-type="doi">10.1186/s13244-024-01833-2</pub-id><pub-id pub-id-type="medline">39400639</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hirano</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Hanaoka</surname><given-names>S</given-names> </name><name name-style="western"><surname>Nakao</surname><given-names>T</given-names> </name><etal/></person-group><article-title>GPT-4 Turbo with Vision fails to outperform text-only GPT-4 Turbo in the Japan Diagnostic Radiology Board 
Examination</article-title><source>Jpn J Radiol</source><year>2024</year><month>08</month><volume>42</volume><issue>8</issue><fpage>918</fpage><lpage>926</lpage><pub-id pub-id-type="doi">10.1007/s11604-024-01561-z</pub-id><pub-id pub-id-type="medline">38733472</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hirano</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Hanaoka</surname><given-names>S</given-names> </name><name name-style="western"><surname>Nakao</surname><given-names>T</given-names> </name><etal/></person-group><article-title>No improvement found with GPT-4o: results of additional experiments in the Japan Diagnostic Radiology Board Examination</article-title><source>Jpn J Radiol</source><year>2024</year><month>11</month><volume>42</volume><issue>11</issue><fpage>1352</fpage><lpage>1353</lpage><pub-id pub-id-type="doi">10.1007/s11604-024-01622-3</pub-id><pub-id pub-id-type="medline">38937409</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bustos</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pertusa</surname><given-names>A</given-names> </name><name name-style="western"><surname>Salinas</surname><given-names>JM</given-names> </name><name name-style="western"><surname>de la Iglesia-Vay&#x00E1;</surname><given-names>M</given-names> </name></person-group><article-title>Padchest: A large chest x-ray image dataset with multi-label annotated reports</article-title><source>Med Image Anal</source><year>2020</year><month>12</month><volume>66</volume><fpage>101797</fpage><pub-id pub-id-type="doi">10.1016/j.media.2020.101797</pub-id><pub-id pub-id-type="medline">32877839</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>de Castro</surname><given-names>DC</given-names> </name><name name-style="western"><surname>Bustos</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bannur</surname><given-names>S</given-names> </name><etal/></person-group><article-title>PadChest-GR: a bilingual chest X-ray dataset for grounded radiology report generation</article-title><source>NEJM AI</source><year>2025</year><month>06</month><day>26</day><volume>2</volume><issue>7</issue><pub-id pub-id-type="doi">10.1056/AIdbp2401120</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ko&#x00E7;ak</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ponsiglione</surname><given-names>A</given-names> </name><name name-style="western"><surname>Stanzione</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Bias in artificial intelligence for medical imaging: Fundamentals, detection, avoidance, mitigation, challenges, ethics, and prospects</article-title><source>Diagn Interv Radiol</source><year>2025</year><month>03</month><day>3</day><volume>31</volume><issue>2</issue><fpage>75</fpage><lpage>88</lpage><pub-id pub-id-type="doi">10.4274/dir.2024.242854</pub-id><pub-id pub-id-type="medline">38953330</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mukherjee</surname><given-names>P</given-names> </name><name name-style="western"><surname>Hou</surname><given-names>B</given-names> </name><name name-style="western"><surname>Suri</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Evaluation of GPT large language model performance on RSNA 2023 Case of the Day 
questions</article-title></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kikuchi</surname><given-names>T</given-names> </name><name name-style="western"><surname>Nakao</surname><given-names>T</given-names> </name><name name-style="western"><surname>Nakamura</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Hanaoka</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mori</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yoshikawa</surname><given-names>T</given-names> </name></person-group><article-title>Toward improved radiologic diagnostics: investigating the utility and limitations of GPT-3.5 Turbo and GPT-4 with quiz cases</article-title><source>AJNR Am J Neuroradiol</source><year>2024</year><month>10</month><day>3</day><volume>45</volume><issue>10</issue><fpage>1506</fpage><lpage>1511</lpage><pub-id pub-id-type="doi">10.3174/ajnr.A8332</pub-id><pub-id pub-id-type="medline">38719605</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Choi</surname><given-names>WC</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>CI</given-names> </name></person-group><article-title>Advantages and limitations of open-source versus commercial large language models (LLMs): a comparative study of DeepSeek and OpenAI&#x2019;s ChatGPT</article-title><source>Preprints.org</source><comment>Preprint posted online on  Mar 14, 2025</comment><pub-id pub-id-type="doi">10.20944/preprints202503.1081.v1</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Zaharia</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zou</surname><given-names>J</given-names> </name></person-group><article-title>How is ChatGPT's behavior changing over time?</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 18, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2307.09009</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Belcak</surname><given-names>P</given-names> </name><name name-style="western"><surname>Heinrich</surname><given-names>G</given-names> </name><name name-style="western"><surname>Diao</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Small language models are the future of agentic AI</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 2, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2506.02153</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bumgardner</surname><given-names>VKC</given-names> </name><name name-style="western"><surname>Mullen</surname><given-names>A</given-names> </name><name name-style="western"><surname>Armstrong</surname><given-names>SE</given-names> </name><name name-style="western"><surname>Hickey</surname><given-names>C</given-names> </name><name name-style="western"><surname>Marek</surname><given-names>V</given-names> </name><name name-style="western"><surname>Talbert</surname><given-names>J</given-names> </name></person-group><article-title>Local large language models for complex structured tasks</article-title><source>AMIA Jt Summits Transl Sci Proc</source><year>2024</year><fpage>105</fpage><lpage>114</lpage><pub-id pub-id-type="medline">38827047</pub-id></nlm-citation></ref><ref 
id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Akashi</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kumamaru</surname><given-names>KK</given-names> </name><name name-style="western"><surname>Wada</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Japan-Medical Image Database (J-MID): medical big data supporting data science</article-title><source>Juntendo Med J</source><year>2025</year><volume>71</volume><issue>3</issue><fpage>166</fpage><lpage>172</lpage><pub-id pub-id-type="doi">10.14789/ejmj.JMJ25-0004-P</pub-id><pub-id pub-id-type="medline">40666496</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>A</given-names> </name><name name-style="western"><surname>Li</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Qwen3 technical report</article-title><source>arXiv</source><comment>Preprint posted online on  May 14, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2505.09388</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Grattafiori</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dubey</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jauhri</surname><given-names>A</given-names> </name><etal/></person-group><article-title>The Llama 3 herd of models</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 31, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2407.21783</pub-id></nlm-citation></ref><ref 
id="ref27"><label>27</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Warner</surname><given-names>B</given-names> </name><name name-style="western"><surname>Chaffin</surname><given-names>A</given-names> </name><name name-style="western"><surname>Clavi&#x00E9;</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Smarter, better, faster, longer: a modern bidirectional encoder for fast, memory efficient, and long context finetuning and inference</article-title><conf-name>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) (ACL 2025)</conf-name><conf-date>Jul 27 to Aug 1, 2025</conf-date><conf-loc>Vienna, Austria</conf-loc><fpage>2526</fpage><lpage>2547</lpage><pub-id pub-id-type="doi">10.18653/v1/2025.acl-long.127</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><etal/></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 11, 2018</comment><pub-id pub-id-type="doi">10.48550/arXiv.1810.04805</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ott</surname><given-names>M</given-names> </name><name name-style="western"><surname>Goyal</surname><given-names>N</given-names> </name><etal/></person-group><article-title>RoBERTa: a 
robustly optimized BERT pretraining approach</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 26, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1907.11692</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yasaka</surname><given-names>K</given-names> </name><name name-style="western"><surname>Nomura</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kamohara</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Classification of interventional radiology reports into technique categories with a fine-tuned large language model</article-title><source>J Imaging Inform Med</source><year>2025</year><month>10</month><volume>38</volume><issue>5</issue><fpage>3366</fpage><lpage>3374</lpage><pub-id pub-id-type="doi">10.1007/s10278-024-01370-w</pub-id><pub-id pub-id-type="medline">39673010</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kanzawa</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yasaka</surname><given-names>K</given-names> </name><name name-style="western"><surname>Fujita</surname><given-names>N</given-names> </name><name name-style="western"><surname>Fujiwara</surname><given-names>S</given-names> </name><name name-style="western"><surname>Abe</surname><given-names>O</given-names> </name></person-group><article-title>Automated classification of brain MRI reports using fine-tuned large language models</article-title><source>Neuroradiology</source><year>2024</year><month>12</month><volume>66</volume><issue>12</issue><fpage>2177</fpage><lpage>2183</lpage><pub-id pub-id-type="doi">10.1007/s00234-024-03427-7</pub-id><pub-id pub-id-type="medline">38995393</pub-id></nlm-citation></ref><ref 
id="ref32"><label>32</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sugimoto</surname><given-names>K</given-names> </name><name name-style="western"><surname>Iki</surname><given-names>T</given-names> </name><name name-style="western"><surname>Chida</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>JMedRoBERTa: a Japanese pre-trained language model on academic articles in medical sciences [Article in Japanese]</article-title><year>2023</year><access-date>2026-03-22</access-date><conf-name>Proceedings of the 29th Annual Meeting of the Association for Natural Language Processing</conf-name><fpage>707</fpage><lpage>712</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.anlp.jp/proceedings/annual_meeting/2023/pdf_dir/P3-1.pdf">https://www.anlp.jp/proceedings/annual_meeting/2023/pdf_dir/P3-1.pdf</ext-link></comment></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yamagishi</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kikuchi</surname><given-names>T</given-names> </name><name name-style="western"><surname>Hanaoka</surname><given-names>S</given-names> </name><etal/></person-group><article-title>ModernBERT is more efficient than conventional BERT for chest CT findings classification in Japanese radiology reports</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 7, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2503.05060</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Wallis</surname><given-names>P</given-names> </name><etal/></person-group><article-title>LoRA: low-rank adaptation of large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 17, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2106.09685</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Dettmers</surname><given-names>T</given-names> </name><name name-style="western"><surname>Holtzman</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pagnoni</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zettlemoyer</surname><given-names>L</given-names> </name></person-group><article-title>QLoRA: efficient finetuning of quantized LLMs</article-title><conf-name>37th Conference on Neural Information Processing Systems (NeurIPS 2023)</conf-name><conf-date>Dec 10-16, 2023</conf-date><pub-id pub-id-type="doi">10.52202/075280-0441</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Peng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Bagheri</surname><given-names>M</given-names> </name><name name-style="western"><surname>Summers</surname><given-names>R</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Z</given-names> </name></person-group><article-title>NegBio: a high-performance tool for negation and uncertainty detection in radiology reports</article-title><source>AMIA Jt Summits Transl Sci 
Proc</source><year>2018</year><volume>2017</volume><fpage>188</fpage><lpage>196</lpage><pub-id pub-id-type="medline">29888070</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Jain</surname><given-names>S</given-names> </name><name name-style="western"><surname>Agrawal</surname><given-names>A</given-names> </name><name name-style="western"><surname>Saporta</surname><given-names>A</given-names> </name><etal/></person-group><article-title>RadGraph: extracting clinical entities and relations from radiology reports</article-title><access-date>2026-03-17</access-date><conf-name>35th Conference on Neural Information Processing Systems (NeurIPS 2021) Track on Datasets and Benchmarks</conf-name><conf-date>Dec 6-14, 2021</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/file/c8ffe9a587b126f152ed3d89a146b445-Paper-round1.pdf">https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/file/c8ffe9a587b126f152ed3d89a146b445-Paper-round1.pdf</ext-link></comment></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kahn</surname><given-names>CE</given-names>  <suffix>Jr</suffix></name><name name-style="western"><surname>Langlotz</surname><given-names>CP</given-names> </name><name name-style="western"><surname>Burnside</surname><given-names>ES</given-names> </name><etal/></person-group><article-title>Toward best practices in radiology reporting</article-title><source>Radiology</source><year>2009</year><month>09</month><volume>252</volume><issue>3</issue><fpage>852</fpage><lpage>856</lpage><pub-id pub-id-type="doi">10.1148/radiol.2523081992</pub-id><pub-id pub-id-type="medline">19717755</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Paschalidis</surname><given-names>IC</given-names> </name><name name-style="western"><surname>Hall</surname><given-names>C</given-names> </name><name name-style="western"><surname>Tahmasebi</surname><given-names>A</given-names> </name></person-group><article-title>Context-driven concept annotation in radiology reports: anatomical phrase labeling</article-title><source>AMIA Jt Summits Transl Sci Proc</source><year>2019</year><volume>2019</volume><fpage>232</fpage><lpage>241</lpage><pub-id pub-id-type="medline">31258975</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bergomi</surname><given-names>L</given-names> </name><name name-style="western"><surname>Buonocore</surname><given-names>TM</given-names> </name><name name-style="western"><surname>Antonazzo</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Reshaping free-text radiology notes into structured reports with generative question answering transformers</article-title><source>Artif Intell Med</source><year>2024</year><month>08</month><volume>154</volume><fpage>102924</fpage><pub-id pub-id-type="doi">10.1016/j.artmed.2024.102924</pub-id><pub-id pub-id-type="medline">38964194</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Fujikawa</surname><given-names>K</given-names> </name><name name-style="western"><surname>Seki</surname><given-names>K</given-names> </name><name name-style="western"><surname>Uehara</surname><given-names>K</given-names> </name></person-group><article-title>A hybrid approach to finding negated and uncertain expressions in 
biomedical documents</article-title><year>2012</year><conf-name>Proceedings of the 2nd International Workshop on Managing Interoperability and Complexity in Health Systems</conf-name><conf-date>Oct 28 to Nov 2, 2012</conf-date><conf-loc>Maui, Hawaii, USA</conf-loc><fpage>67</fpage><lpage>74</lpage><pub-id pub-id-type="doi">10.1145/2389672.2389685</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Hamma</surname><given-names>MB</given-names> </name><name name-style="western"><surname>Merrouni</surname><given-names>ZA</given-names> </name><name name-style="western"><surname>Frikh</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ouhbi</surname><given-names>B</given-names> </name></person-group><article-title>Automatic radiology report generation: a comprehensive review and innovative framework</article-title><conf-name>2024 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)</conf-name><conf-date>Dec 3-6, 2024</conf-date><conf-loc>Lisbon, Portugal</conf-loc><fpage>4225</fpage><lpage>4232</lpage><pub-id pub-id-type="doi">10.1109/BIBM62325.2024.10821974</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Casey</surname><given-names>A</given-names> </name><name name-style="western"><surname>Davidson</surname><given-names>E</given-names> </name><name name-style="western"><surname>Poon</surname><given-names>M</given-names> </name><etal/></person-group><article-title>A systematic review of natural language processing applied to radiology reports</article-title><source>BMC Med Inform Decis Mak</source><year>2021</year><month>06</month><day>3</day><volume>21</volume><issue>1</issue><fpage>179</fpage><pub-id pub-id-type="doi">10.1186/s12911-021-01533-7</pub-id><pub-id 
pub-id-type="medline">34082729</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Delbrouck</surname><given-names>JB</given-names> </name><name name-style="western"><surname>Chambon</surname><given-names>P</given-names> </name><name name-style="western"><surname>Bluethgen</surname><given-names>C</given-names> </name><name name-style="western"><surname>Tsai</surname><given-names>E</given-names> </name><name name-style="western"><surname>Almusa</surname><given-names>O</given-names> </name><name name-style="western"><surname>Langlotz</surname><given-names>C</given-names> </name></person-group><article-title>Improving the factual correctness of radiology report generation with semantic rewards</article-title><conf-name>Findings of the Association for Computational Linguistics: EMNLP 2022</conf-name><conf-date>Dec 7-11, 2022</conf-date><conf-loc>Abu Dhabi, United Arab Emirates</conf-loc><fpage>4348</fpage><lpage>4360</lpage><pub-id pub-id-type="doi">10.18653/v1/2022.findings-emnlp.319</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>X</given-names> </name></person-group><article-title>Do BERT-like bidirectional models still perform better on text classification in the era of LLMs?</article-title><conf-name>Findings of the Association for Computational Linguistics: EMNLP 2025</conf-name><conf-date>Nov 4-9, 2025</conf-date><conf-loc>Suzhou, 
China</conf-loc><fpage>18980</fpage><lpage>18989</lpage><pub-id pub-id-type="doi">10.18653/v1/2025.findings-emnlp.1033</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Geng</surname><given-names>S</given-names> </name><name name-style="western"><surname>Cooper</surname><given-names>H</given-names> </name><name name-style="western"><surname>Moskal</surname><given-names>M</given-names> </name><etal/></person-group><article-title>JSONSchemaBench: a rigorous benchmark of structured outputs for language models</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 18, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2501.10868</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Kikuchi</surname><given-names>T</given-names> </name></person-group><article-title>Context-Aware-Rad-Sentence-Classification</article-title><source>GitHub</source><access-date>2026-03-22</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/jichi-labo/Context-Aware-Rad-Sentence-Classification">https://github.com/jichi-labo/Context-Aware-Rad-Sentence-Classification</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Prompts for synthetic report generation and annotation, model card information, throughput benchmarks, parsing error analysis, and subanalysis of label 1 predictions by sentence complexity.</p><media xlink:href="jmir_v28i1e86365_app1.pdf" xlink:title="PDF File, 296 KB"/></supplementary-material></app-group></back></article>