<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e78681</article-id><article-id pub-id-type="doi">10.2196/78681</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Evaluating Encoder and Decoder Models for Extended Clinical Concept Recognition in Japanese Clinical Texts: Comparative Study With Weighted Soft Matching</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Tsukiji</surname><given-names>Yuya</given-names></name><degrees>MMSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kataoka</surname><given-names>Satoshi</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Itokazu</surname><given-names>Masafumi</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Nagai</surname><given-names>Ryozo</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Imai</surname><given-names>Takeshi</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Center for Disease Biology and Integrative Medicine, Graduate School of Medicine, The University of Tokyo</institution><addr-line>7-3-1 Hongo, Bunkyo-ku, Clinical Research Center A646, The University of Tokyo Hospital</addr-line><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff2"><institution>Jichi Medical University</institution><addr-line>Tochigi</addr-line><country>Japan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Ma</surname><given-names>Chunwei</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Torii</surname><given-names>Manabu</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Takeshi Imai, PhD, Center for Disease Biology and Integrative Medicine, Graduate School of Medicine, The University of Tokyo, 7-3-1 Hongo, Bunkyo-ku, Clinical Research Center A646, The University of Tokyo Hospital, Tokyo, 1138655, Japan, 81 03-5841-3454; <email>imai@m.u-tokyo.ac.jp</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>14</day><month>5</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e78681</elocation-id><history><date date-type="received"><day>13</day><month>06</month><year>2025</year></date><date 
date-type="rev-recd"><day>01</day><month>03</month><year>2026</year></date><date date-type="accepted"><day>17</day><month>03</month><year>2026</year></date></history><copyright-statement>&#x00A9; Yuya Tsukiji, Satoshi Kataoka, Masafumi Itokazu, Ryozo Nagai, Takeshi Imai. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 14.5.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e78681"/><abstract><sec><title>Background</title><p>Extracting medical knowledge for secondary purposes, such as diagnostic support, continues to pose a substantial challenge. Conventional named entity recognition has focused on short terms (eg, genes, diseases, and chemicals), whereas extraction and assessment of longer, complex expressions remain underexplored. Clinically vital concepts, such as diseases, pathologies, symptoms, and findings, often appear as long phrases, and accurate extraction is crucial for applications such as constructing causal knowledge from case reports. 
Consequently, a framework addressing both short terms and clinically meaningful long phrases&#x2014;termed extended Clinical Concept Recognition (E-CCR)&#x2014;is essential.</p></sec><sec><title>Objective</title><p>This study, the first comprehensive investigation of E-CCR model selection, aimed to identify optimal strategies by comparing encoder versus decoder models and general-purpose versus domain-specific pretraining. We analyzed variation in effectiveness by target length and proposed a novel E-CCR evaluation metric.</p></sec><sec sec-type="methods"><title>Methods</title><p>We evaluated 17 encoder and decoder models using J-CaseMap, a database of approximately 20,000 Japanese case reports annotated with clinical concepts. Performance was primarily assessed using the weighted soft matching score, which penalizes fragmentation of long extraction targets and weights scores by target length to account for the greater difficulty of extracting longer expressions.</p></sec><sec sec-type="results"><title>Results</title><p>On J-CaseMap, JMedDeBERTa(s)&#x2014;an encoder model pretrained on domain-specific medical text&#x2014;achieved the highest mean performance (F1-score=0.758, SD 0.002), with similarly strong results from JMedDeBERTa(c), suggesting comparable performance among the top encoder models. As the fragmentation penalty increased, performance generally declined; however, no consistently severe degradation was observed. On the Medical Report Named Entity Recognition for positive disease dataset, the general-domain DeBERTaV2-base yielded the highest mean F1 score, and differences among the medical-domain JMedDeBERTa(s) and JMedDeBERTa(c) variants were small, suggesting limited benefit of domain-specific pretraining. 
Overall, under our experimental settings (low-rank adaptation fine-tuning for decoders and full fine-tuning for encoders), encoder models outperformed decoder models, and token classification outperformed our instruction tuning setup.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Under our experimental setting, encoder-based token classification achieved the highest mean performance on our internal dataset. Differences among the top encoder models were small and should be interpreted as comparable within the uncertainty implied by our annotation review, whereas decoder-based approaches did not surpass encoder-based models in this setup, suggesting that encoder models can deliver high accuracy with fewer parameters and may offer practical advantages in resource-constrained environments. Token classification outperformed instruction tuning for extracting long expressions, whereas instruction tuning was better suited to short terms. Using the weighted soft matching score, we found that performance did not substantially deteriorate as the fragmentation penalty increased, indicating that extracted spans were rarely fragmented. Similar trends in external validation datasets suggest that findings under our setup may generalize to information extraction tasks on Japanese medical text. Further investigation is needed to determine whether these findings hold across other languages and medical document types.</p></sec></abstract><kwd-group><kwd>natural language processing</kwd><kwd>NLP</kwd><kwd>named entity recognition</kwd><kwd>instruction tuning</kwd><kwd>token classification</kwd><kwd>large language model</kwd><kwd>LLM</kwd><kwd>transformer model</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>With advancements in digitization within the medical field, various medical documents, including electronic medical records, case reports, and discharge summaries, are increasingly digitized. 
There is a growing demand for the secondary use of these digitized documents, such as extracting medical knowledge for diagnostic support, and research in this area is advancing. For example, Japan has developed a diagnostic support system called &#x201C;Difficult Diagnosis Case Search: J-CaseMap&#x201D; [<xref ref-type="bibr" rid="ref1">1</xref>]. J-CaseMap uses a manually constructed knowledge graph created by approximately 150 internists, extracting the causal relationships between diseases and the associated pathophysiological processes and symptoms from approximately 20,000 case reports collected from local chapters of the Japan Society of Internal Medicine. A system has been developed that, through search and inference on this large-scale knowledge graph integrating all causal chains from these cases, presents both a &#x201C;list of differential diagnoses suggested by a given combination of input symptoms and findings&#x201D; and &#x201C;the causal chain leading to those diagnoses.&#x201D; This system has been implemented as a service for members of the Japan Society of Internal Medicine.</p><p>Such diagnostic support systems can assist in searching similar cases, suggesting diseases, and supporting diagnostic reasoning, making them highly valuable for standardizing the quality of medical care. However, constructing and updating the underlying structured knowledge database requires substantial time and effort, posing substantial challenges to efficiency and automation. 
Consequently, demand for automated methods leveraging large language models (LLMs), which have seen rapid advances in recent years, has increased to structure knowledge from medical documents using computers.</p><p>When constructing a knowledge database of the causal relationships among diseases, pathologies, symptoms, and findings that are considered essential for differential diagnosis from clinical texts (such as case reports), the first crucial step is to extract the terms and phrases representing these key clinical concepts. Conventionally, named entity recognition (NER) has been widely used to extract patient-related information from medical texts, including clinical records, medical literature, and electronic health records [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>]. The expressions extracted can then be used for aggregating clinical data, substantiating biomedical findings, supporting clinical decision-making, improving patient care, and enhancing health care management and resource allocation [<xref ref-type="bibr" rid="ref4">4</xref>]. However, much of the prior research has concentrated solely on extracting terms or short expressions (eg, disease names, symptoms, drug names, and treatment methods) [<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref10">10</xref>].</p><p>The clinical concepts critical for differential diagnosis&#x2014;diseases, pathologies, symptoms, and findings&#x2014;are not necessarily confined to single terms or short expressions. For instance, the expression &#x201C;Anti-GAD antibody level was elevated at 3790.0 U/mL&#x201D; in its entirety denotes the concept &#x201C;anti-GAD-positive.&#x201D; If this concept is considered as a node in a causal knowledge graph, then extracting the corresponding long expression from the text becomes essential. 
Thus, when addressing the task of &#x201C;extracting medical concepts that reflect the clinically meaningful units considered by physicians during diagnosis,&#x201D; a comprehensive extraction framework is needed that targets not only terms (short expressions) but also longer expressions (long expressions) that may span multiple sentences. In this study, this task is referred to as Extended Clinical Concept Recognition (E-CCR). This distinguishes our task from conventional NER, which typically targets shorter named entities such as disease names, drug names, or laboratory items.</p><p>Several studies [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref13">13</xref>] have investigated Japanese medical text processing, including work on deidentification, document classification, and extraction of relatively short medical entities. However, to our knowledge, no prior study has systematically addressed the extraction of &#x201C;clinically important long expressions&#x201D; that form the nodes of a clinical knowledge graph.</p><p>Previous studies have primarily used evaluation metrics such as ROUGE-1, ROUGE-2, ROUGE-L, recall, precision, and <italic>F</italic><sub>1</sub>-score. ROUGE calculates scores based on string similarity [<xref ref-type="bibr" rid="ref14">14</xref>]. <italic>F</italic><sub>1</sub>-scores have been computed by segmenting annotation schemes (ie, the extraction targets) into tokens [<xref ref-type="bibr" rid="ref2">2</xref>] or characters [<xref ref-type="bibr" rid="ref15">15</xref>]. Scores are typically computed on a token or character basis rather than per extracted unit, as the number of characters in each extraction target may vary. However, when comparing multiple models, token-based metrics should be avoided because each LLM uses its tokenizer and varying tokenizers yield different segmentation granularities. 
Conversely, even when ROUGE or <italic>F</italic><sub>1</sub>-scores are computed on a character basis, their validity is questionable for long extraction targets. This is because a fragmented extraction of a long expression, although undesirable, may still attain a high score on a character basis, simply because the majority of characters are captured. Therefore, when comparing multiple models that include long expressions as extraction targets, it becomes imperative to develop a new evaluation metric that penalizes fragmented outputs and is unaffected by differences in tokenizers.</p><p>Expression extraction can be approached as either a sequence labeling task or a generation task. Conventionally, many researchers have addressed expression extraction as a sequence labeling task. Techniques such as conditional random fields [<xref ref-type="bibr" rid="ref16">16</xref>] and long short-term memory (LSTM) [<xref ref-type="bibr" rid="ref17">17</xref>] have been proposed, with transformer-based encoder models, particularly those based on Bidirectional Encoder Representations from Transformers (BERT) [<xref ref-type="bibr" rid="ref18">18</xref>], gaining substantial popularity [<xref ref-type="bibr" rid="ref19">19</xref>]. More recently, with the emergence of LLMs, interest in performing expression extraction as a generation task has increased, and various information extraction techniques such as cross-domain learning, zero-shot prompting, in-context learning, supervised fine-tuning, and data augmentation have emerged [<xref ref-type="bibr" rid="ref20">20</xref>]. However, as generation tasks require simultaneous text generation and expression extraction, they are inherently more complex than sequence labeling approaches. 
Therefore, extracting expressions via generation tasks necessitates careful prompt design and fine-tuning [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref25">25</xref>].</p><p>However, whether encoder or decoder models are more suitable for expression extraction remains unclear. For example, even when techniques such as designing prompt formats that facilitate the proper noun extraction [<xref ref-type="bibr" rid="ref22">22</xref>], tailoring prompts to specific tasks [<xref ref-type="bibr" rid="ref23">23</xref>], and customizing datasets [<xref ref-type="bibr" rid="ref24">24</xref>] are used, in zero-shot or few-shot settings, decoder models (eg, GPT-3 [<xref ref-type="bibr" rid="ref25">25</xref>], GPT-3.5, and GPT-4 [<xref ref-type="bibr" rid="ref26">26</xref>]) have not outperformed encoder models (eg, BERT [<xref ref-type="bibr" rid="ref23">23</xref>], BioClinicalBERT [<xref ref-type="bibr" rid="ref24">24</xref>], and RoBERTa-Large [<xref ref-type="bibr" rid="ref27">27</xref>]). In contrast, BioNER-LLAMA, a decoder model specialized for the medical domain, has reportedly surpassed the encoder model BioBERT by using instruction tuning on a dataset for proper noun extraction in biomedical contexts and by using carefully designed prompts [<xref ref-type="bibr" rid="ref28">28</xref>]. In another study [<xref ref-type="bibr" rid="ref29">29</xref>], when encoder and decoder models were both used for token classification in expression extraction, LLaMA2 outperformed RoBERTa.</p><p>Additionally, insights into the effectiveness of domain-specific texts for expression extraction in the medical domain remain limited. Previous studies have compared models pretrained on domain-specific medical texts or further adapted to them with models pretrained on general text. 
For example, BERT_mimic, pretrained on medical texts, demonstrated superior performance compared to a BERT model pretrained on general English texts [<xref ref-type="bibr" rid="ref30">30</xref>]. Similarly, BioBERT, initially pretrained on general English texts and subsequently adapted to medical texts, outperformed BERT [<xref ref-type="bibr" rid="ref31">31</xref>]. In contrast, BioELECTRA, pretrained on a relatively small corpus (0.8 GB) of German medical texts, performed worse than DBMDZ ELECTRA, which was pretrained on general texts. Lentzen et al [<xref ref-type="bibr" rid="ref32">32</xref>] reported that BioGottBERT&#x2014;first pretrained on general texts and then further pretrained on similar medical texts&#x2014;achieved the best performance in some cases. Subies et al [<xref ref-type="bibr" rid="ref33">33</xref>] confirmed the effectiveness of encoder models in Spanish clinical tasks but noted that the best-performing model was not domain adapted; rather, domain-adapted models performed worse than general or multilingual models. Thus, the current insights regarding the effectiveness of domain-adapted models remain inadequate. Moreover, prior studies have primarily focused on extracting short expressions such as drug names, disease names, symptoms, and treatment methods, and evidence on long expression extraction remains limited.</p><p>Therefore, this study aimed to elucidate effective model selection strategies for the E-CCR task by comparing encoder and decoder models, as well as general-purpose models with domain-specific models. We propose a novel evaluation metric tailored for E-CCR that not only incorporates conventional measures but also accounts for penalties when long extraction targets are fragmented. 
We analyzed how model effectiveness varies with the length of the extraction target.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p><xref ref-type="fig" rid="figure1">Figure 1</xref> illustrates the flowchart of our research design. We compared and evaluated sequence labeling tasks (token classification) with generation tasks (instruction tuning) using a corpus of clinically essential concepts extracted from case reports. Performance was assessed using a weighted soft matching score, and measures&#x2014;including <italic>F</italic><sub>1</sub>-score, recall, and precision&#x2014;were used for the 17 encoder and decoder models.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overall study design for comparing encoder and decoder language models on the extended Clinical Concept Recognition task. Japanese clinical case reports from the J-CaseMap database were used as input texts. Seventeen language models (encoder and decoder architectures and instruction tuning vs token classification settings) were fine-tuned in a 5-fold cross-validation framework to extract clinically important concepts from these reports. Model outputs were evaluated using the weighted soft matching score with varying fragmentation penalties (<italic>p</italic>=1, 1.5, 2, 100), along with conventional <italic>F</italic><sub>1</sub>, recall, and precision, followed by qualitative review of output examples.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e78681_fig01.png"/></fig></sec><sec id="s2-2"><title>Materials</title><p>We used data from the J-CaseMap database, which comprises abstracts from approximately 20,000 case reports presented at local chapters of the Japanese Society of Internal Medicine. Each abstract contains approximately 600 characters. 
The database was manually constructed by approximately 150 internists, who extracted causal chains of diseases, resulting pathophysiological processes, and associated clinical signs and symptoms from these case reports. The knowledge graph comprises &#x201C;nodes,&#x201D; representing term expressions that are normalized by specific rules, and &#x201C;edges,&#x201D; which represent the causal relationships between them. Although such a knowledge database supports decision systems in differential diagnosis, its construction requires substantial manual effort, thereby necessitating the development of automated methods.</p><p>Using J-CaseMap as an example, the task of automatically constructing a causal chain knowledge base from case reports can be divided into three steps:</p><list list-type="order"><list-item><p>Extraction: identify and extract terms or phrases from the case reports that are considered clinically essential for differential diagnosis (eg, diseases, pathophysiological processes, signs, and symptoms)</p></list-item><list-item><p>Normalization: map the extracted expressions to standardized terminology</p></list-item><list-item><p>Causal inference: infer causal relationships among the normalized expressions</p></list-item></list><p>As shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>, the first critical step in developing an automated approach is extracting clinically essential concepts that form the nodes of the knowledge graph. When these essential clinical concepts appear in the text, they often manifest as long expressions. 
For instance, the sentence &#x201C;Glucagon&#x8CA0;&#x8377;&#x8A66;&#x9A13;&#x3067;&#x5185;&#x56E0;&#x6027;&#x30A4;&#x30F3;&#x30B9;&#x30EA;&#x30F3;&#x5206;&#x6CCC;&#x53CD;&#x5FDC;&#x306E;&#x4F4E;&#x4E0B;&#x3092;&#x8A8D;&#x3081;&#x305F;&#x201D; might be represented as &#x201C;Glucagon&#x8CA0;&#x8377;&#x8A66;&#x9A13; = &#x5185;&#x56E0;&#x6027;&#x30A4;&#x30F3;&#x30B9;&#x30EA;&#x30F3;&#x5206;&#x6CCC;/&#x4F4E;&#x4E0B;&#x201D; (ie, <italic>&#x201C;The glucagon stimulation test indicated a reduced endogenous insulin secretory response&#x201D; &#x2192; &#x201C;glucagon stimulation test=endogenous insulin secretion/reduced&#x201D;</italic>), constituting a long expression.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Correspondence between clinically essential expressions in a Japanese case report and the J-CaseMap knowledge graph. The left panel shows an example Japanese case report (with English translation) describing type 1 diabetes complicated by Graves disease. Phrases highlighted in light blue indicate text spans selected by physicians as clinical concepts (eg, diagnoses, symptoms, and test findings) that are important for differential diagnosis. The right panel illustrates how these expressions are normalized as nodes and linked by causal relationships (edges) within the J-CaseMap knowledge graph. These node-defining spans constitute the gold-standard corpus for the extended Clinical Concept Recognition task. GAD: glutamic acid decarboxylase; HbA<sub>1c</sub>: hemoglobin A<sub>1c</sub>; ICA: islet cell antibody; SPIDDM: slowly progressive insulin-dependent diabetes mellitus; TBII: thyrotropin binding inhibitory immunoglobulin.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e78681_fig02.png"/></fig><p>Using the data from J-CaseMap, we created a corpus of approximately 150,000 pairs of sentences. 
Six nurses identified segments corresponding to the normalized &#x201C;term expressions&#x201D; manually from the original case report sentences. This corpus was used to define clinically essential expressions in sentences as descriptions and definitions of nodes in the knowledge graph of J-CaseMap. This study aimed to extract these clinically meaningful expressions in sentences. Converting expressions from extracted sentences into normalized terminology was beyond the scope of this study.</p><p>The medical text corpus was used to fine-tune the encoder and decoder models. We used 0.56 GB of medical domain texts, which combined real-world clinical texts, such as discharge summaries (0.53 GB), and textbook-style medical texts, such as medical textbooks (0.03 GB), to construct a unique DeBERTa [<xref ref-type="bibr" rid="ref34">34</xref>] pretraining model and an additional pretraining model with complex medical knowledge.</p></sec><sec id="s2-3"><title>Annotation Procedure</title><p>To minimize the subjectivity inherent in the definition of &#x201C;clinically important expressions,&#x201D; we designed our annotation scheme on the basis of the J-CaseMap knowledge graph. For each case report, we first regarded as &#x201C;descriptions important for clinical reasoning&#x201D; those concepts that the physicians editing J-CaseMap had adopted as component nodes of the knowledge graph for that case (this node selection process itself involved multiple rounds of review by several physicians). We then constructed the corpus by marking, in the original case report text, the descriptions that corresponded to these node expressions. The annotation task was performed by 6 nurses, with 2 nurses independently annotating each case report. When their results did not agree, the 2 annotators discussed the discrepancies and revised the annotations until full consensus was reached. 
However, because we did not retain the preadjudication annotations, we could not compute the initial interannotator agreement (IAA). Accordingly, the resulting labels should be interpreted as an adjudicated consensus reference standard rather than an objective ground truth. Thus, our evaluation primarily quantifies agreement with this consensus, and some apparent &#x201C;errors&#x201D; may reflect clinically acceptable alternative span selections rather than model failures.</p></sec><sec id="s2-4"><title>Creating Correct Answer Label Data</title><p><xref ref-type="fig" rid="figure3">Figure 3</xref> illustrates the process for generating correct label data and handling input and output for the model.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Creation of gold-standard labels and model inputs for instruction tuning versus token classification. The example sentence &#x201C;&#x611F;&#x67D3;&#x6027;&#x5FC3;&#x5185;&#x819C;&#x708E; (IE&#xFF09;&#x306F;&#x305D;&#x306E;&#x7D4C;&#x904E;&#x201D; (&#x201C;The course of infective endocarditis (IE)&#x201D;) is taken from a Japanese clinical case report. In the instruction tuning setting (left), clinically important concepts within the sentence are enclosed by special markers (@@ and ##) in the output text. In the token classification setting (right), the same sentence is split into characters, and binary labels are assigned to each character (1 for characters within clinically essential expressions and 0 otherwise), which are then mapped to tokens based on the tokenization. 
Character-level labels are used to calculate the weighted soft matching score, enabling fair comparison across models with different tokenizers.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e78681_fig03.png"/></fig><p>In instruction tuning, following Wang et al [<xref ref-type="bibr" rid="ref23">23</xref>] and Keloth et al [<xref ref-type="bibr" rid="ref28">28</xref>], &#x2460; case report sentences are provided as input to the models, and &#x2461; important clinical concepts are labeled by enclosing them with @@ and ## markers. These marked expressions are then extracted, and performance evaluation is conducted after splitting the text into characters (&#x2462;). We enabled byte fallback to ensure robust handling of out-of-vocabulary characters.</p><p>In token classification, &#x2460; case report sentences are split into characters (&#x2462;); a label of &#x201C;1&#x201D; is assigned to character strings identified as critical clinical concepts, and &#x201C;0&#x201D; is assigned to all other character strings. The case report text is then tokenized and labeled with &#x201C;1&#x201D; for tokens representing important clinical concepts and &#x201C;0&#x201D; for all other tokens. At this point, if the byte fallback function of each model is enabled, expressions that are not in the vocabulary set of the tokenizer are subdivided, making it difficult to label them (eg, &#x201C;&#x86D4;&#x866B;(<italic>Ascaris</italic>)&#x201D; &#x2192; &#x201C;&#x2018;&#x003C;0xE8&#x003E;&#x2019;, &#x2018;&#x003C;0x9B&#x003E;&#x2019;, &#x2018;&#x003C;0x94&#x003E;&#x2019;, &#x2018;&#x866B;&#x2019;&#x201D;). Notably, the encoder models used in this study originally had byte fallback disabled by default. Accordingly, for consistency across models and to avoid label fragmentation, the byte fallback function was disabled in all encoder and decoder models used for token classification, and unknown tokens were allowed. 
Next, the case report sentences are input to multiple models, and the labels after step &#x2463; represent the model outputs. Performance evaluation is conducted on a character-by-character basis.</p><p>Performance was evaluated on the labels generated after the aforementioned steps for both tasks. The performance evaluation used a weighted soft matching score inspired by the H&#x00F6;lder mean.</p><disp-formula id="equWL1"><mml:math id="eqn1"><mml:msup><mml:mrow><mml:mfenced separators="|"><mml:mrow><mml:mrow><mml:msubsup><mml:mo stretchy="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:msup><mml:mrow><mml:mfenced separators="|"><mml:mrow><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:mfenced></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:mrow></mml:mrow></mml:mfenced></mml:mrow><mml:mrow><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:msup></mml:math></disp-formula><p>For each extracted entity, <italic>l</italic> is the length of the entity string, <italic>m</italic> is the number of markers assigned to that entity, and <italic>a</italic><sub><italic>i</italic></sub> is the string length of the <italic>i</italic>-th marker. Finally, <italic>p</italic> represents the penalty strength.</p><p>This evaluation metric imposes a penalty on the extraction process when segmented parts are present. Each extraction target was scored using this weighted soft matching score; furthermore, considering that extracting longer targets is more challenging, the scores were aggregated using a length-weighted average based on the target length. This assigns relatively higher scores to more challenging extractions. 
This evaluation metric is designed such that when <italic>p</italic>=1, it corresponds to a character-level evaluation. As <italic>p</italic> increases, there is a penalty for the extracted part if there is a split in the extracted marker, and when <italic>p</italic>=&#x221E;, it converges to the ratio of the longest extracted part in the extracted target.</p><p>Performance was also assessed on a marker-by-marker basis (marker matching score)&#x2014;a simple average that does not account for target length&#x2014;as well as on token-split and character-split bases.</p><p>Results are presented in Tables S1-S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. In this study, the number of extraction targets differed for each case; therefore, we evaluated all target extraction markers contained in multiple instances in the test set. Case-specific performance evaluations are presented in Tables S4-S7 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>Using the aforementioned evaluation metrics, we computed true positives, false positives, and false negatives, focusing on the target extraction spans, and then calculated the <italic>F</italic><sub>1</sub>-score from these values. 
We did not count true negatives for strings outside the extraction targets; therefore, even if instruction tuning produces outputs that differ from the source text in nonextraction spans, such differences do not affect the evaluation.</p></sec><sec id="s2-5"><title>Training Settings</title><p>Token classification and instruction tuning were conducted on the corpus of important clinical concepts using the following models:</p><list list-type="bullet"><list-item><p>General-purpose models pretrained on Japanese text include:</p><list list-type="bullet"><list-item><p>RoBERTa [<xref ref-type="bibr" rid="ref35">35</xref>] (rinna/japanese-roberta-base; hereinafter RoBERTa-base)</p></list-item><list-item><p>DeBERTa (izumi-lab/deberta-v2-base-japanese; hereinafter DeBERTaV2-base)</p></list-item><list-item><p>GPT2 [<xref ref-type="bibr" rid="ref36">36</xref>] (llm-jp/llm-jp-13b-v1.0, llm-jp/llm-jp-13b-instruct-full-dolly_en-dolly_ja-ichikara_003_001-oasst_en-oasst_ja-v1.1; hereinafter LLM-jp-13B v1.0)</p></list-item><list-item><p>GPT-NeoX [<xref ref-type="bibr" rid="ref37">37</xref>] (rinna/japanese-gpt-neox-3.6b, rinna/japanese-gpt-neox-3.6b-instruction-sft-v2; hereinafter GPT-NeoX-3.6B)</p></list-item></list></list-item><list-item><p>Domain-specific models pretrained on medical texts include:</p><list list-type="bullet"><list-item><p>RoBERTa (alabnii/jmedroberta-base-sentencepiece-vocab50000; hereinafter JMedRoBERTa)</p></list-item><list-item><p>DeBERTa pretrained on original medical texts (JMedDeBERTa(s))</p></list-item></list></list-item><list-item><p>Models additionally pretrained on general-purpose Japanese text include:</p><list list-type="bullet"><list-item><p>LLaMA2 [<xref ref-type="bibr" rid="ref38">38</xref>]: (tokyotech-llm/Swallow-7b-hf, tokyotech-llm/Swallow-7b-instruct-v0.1, tokyotech-llm/Swallow-13b-hf, tokyotech-llm/Swallow-13b-instruct-v0.1, tokyotech-llm/Swallow-70b-hf, tokyotech-llm/Swallow-70b-instruct-v0.1, hereinafter Swallow-7B, Swallow-13B, and 
Swallow-70B)</p></list-item><list-item><p>Mistral [<xref ref-type="bibr" rid="ref39">39</xref>]: (tokyotech-llm/Swallow-MS-7b-v0.1, tokyotech-llm/Swallow-MS-7b-instruct-v0.1, hereinafter Swallow-MS-7B)</p></list-item></list></list-item><list-item><p>Models further pretrained on medical texts include:</p><list list-type="bullet"><list-item><p>DeBERTa (izumi-lab/deberta-v2-base-japanese) with additional pretraining on medical texts (hereinafter JMedDeBERTa(c))</p></list-item></list></list-item></list><p>In the list mentioned earlier, the descriptions in parentheses represent the model identifiers on Hugging Face, the largest data-sharing platform for LLMs.</p><p>Prior research has reported that even slight modifications in prompt design can markedly alter model performance, indicating that developing effective prompts requires domain-specific expertise [<xref ref-type="bibr" rid="ref2">2</xref>]. On the basis of the studies by Wang et al [<xref ref-type="bibr" rid="ref23">23</xref>] and Keloth et al [<xref ref-type="bibr" rid="ref28">28</xref>], we designed the prompt for instruction tuning as follows: &#x201C;&#x4EE5;&#x4E0B;&#x306F;, &#x30BF;&#x30B9;&#x30AF;&#x3092;&#x8AAC;&#x660E;&#x3059;&#x308B;&#x6307;&#x793A;&#x3068;, &#x6587;&#x8108;&#x306E;&#x3042;&#x308B;&#x5165;&#x529B;&#x306E;&#x7D44;&#x307F;&#x5408;&#x308F;&#x305B;&#x3067;&#x3059;&#x3002;&#x201D; &#x201C;&#x8981;&#x6C42;&#x3092;&#x9069;&#x5207;&#x306B;&#x6E80;&#x305F;&#x3059;&#x5FDC;&#x7B54;&#x3092;&#x66F8;&#x304D;&#x306A;&#x3055;&#x3044;&#x3002;\n\n&#x201D; &#x201C;### &#x6307;&#x793A;:\n&#x4E0E;&#x3048;&#x3089;&#x308C;&#x305F;&#x6587;&#x306B;&#x5BFE;&#x3057;&#x3066;&#x91CD;&#x8981;&#x81E8;&#x5E8A;&#x6982;&#x5FF5;&#x3092;@@&#x3068;##&#x3067;&#x5F37;&#x8ABF;&#x8868;&#x793A;&#x3057;, &#x62BD;&#x51FA;&#x3057;&#x3066;&#x304F;&#x3060;&#x3055;&#x3044;&#x3002;&#x201D; 
&#x201C;&#x91CD;&#x8981;&#x81E8;&#x5E8A;&#x6982;&#x5FF5;&#x304C;&#x5B58;&#x5728;&#x3057;&#x306A;&#x3044;&#x5834;&#x5408;&#x306F;&#x540C;&#x3058;&#x6587;&#x3092;&#x51FA;&#x529B;&#x3057;&#x3066;&#x304F;&#x3060;&#x3055;&#x3044;&#x3002;\n\n### &#x5165;&#x529B;:{input}\n\n### &#x5FDC;&#x7B54;:&#x201D; (<italic>&#x201C;The following is a combination of instructions describing a task and contextual input.&#x201D; &#x201C;Write a response that appropriately fulfills the request.&#x201D; &#x201C;### Instruction:\n For the given sentence, highlight the key clinical concepts using @@ and ##, and extract them.&#x201D; &#x201C;If there are no key clinical concepts, output the same sentence as it is. \n\n### Input: {input}\n\n### Response:&#x201D;</italic>). After fine-tuning, we compared these 17 models.</p></sec><sec id="s2-6"><title>Constructing the Pretraining and Additional Pretraining Models Using Medical Texts</title><p>As shown in <xref ref-type="fig" rid="figure4">Figure 4</xref>, 0.56 GB of medical domain text was used to construct JMedDeBERTa(s) and JMedDeBERTa(c). When constructing JMedDeBERTa(s), SentencePiece [<xref ref-type="bibr" rid="ref40">40</xref>] was used as the tokenizer.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Construction of JMedDeBERTa(s) and JMedDeBERTa(c) using Japanese medical texts. JMedDeBERTa(s) is an encoder model with the DeBERTaV2-base architecture pretrained from scratch on 0.56 GB of Japanese medical text (0.53 GB clinical documents such as discharge summaries and 0.03 GB medical textbooks), then fine-tuned on the extended Clinical Concept Recognition (E-CCR) data set derived from J-CaseMap. 
JMedDeBERTa(c) starts from a general-domain DeBERTaV2-base model pretrained on 362 GB of mixed-domain Japanese text, continues pretraining on the same 0.56 GB medical corpus, and is then fine-tuned on the same E-CCR data set.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e78681_fig04.png"/></fig><p>For JMedDeBERTa(s) and JMedDeBERTa(c), pretraining and additional pretraining were conducted on 8 V100 GPUs (16 GB each). The medical corpus was split into training and validation sets at a ratio of 0.8:0.2, and the models were trained using the hyperparameters listed in Table S8 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The effective global batch size was 2048 sequences (32 per GPU&#x00D7;8 GPUs, with gradient accumulation of 8). Under this setting, the maximum of 60 training epochs corresponded to 81,960 optimizer update steps (1366 updates per epoch). Therefore, the learning curves in Figures S1 and S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> represent the full 60-epoch training run, plotted up to approximately 80,000 update steps. In general, substantially larger datasets are required to learn robust language representations. As our domain corpus is relatively small, we additionally monitored the masked language modeling (MLM) loss for both JMedDeBERTa(s) and JMedDeBERTa(c) to verify that training proceeded appropriately.</p></sec><sec id="s2-7"><title>Fine-Tuning</title><sec id="s2-7-1"><title>Overview</title><p><xref ref-type="fig" rid="figure5">Figure 5</xref> lists the 17 comparison models used for fine-tuning, and <xref ref-type="table" rid="table1">Table 1</xref> provides an overview of the various models. 
For fine-tuning, the corpus of clinically significant concepts was divided into training, validation, and test sets at a ratio of 0.6:0.2:0.2, and 5-fold cross-validation was performed for 10 epochs (with early stopping). The 70B decoder model could be trained on an NVIDIA H100 (80 GB) GPU, while all other models were trainable on a single NVIDIA A6000 (48 GB) GPU. The hyperparameters for the encoder and decoder models were selected based on settings used in prior studies on NER tasks [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>] and are detailed in Table S9 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Fine-tuning performance metrics included F1 score, recall, and precision, evaluated using the weighted soft matching score. Due to computational resource constraints, we applied low-rank adaptation (LoRA) to the decoder models and full fine-tuning to the encoder models. This choice was motivated by prior work showing that LoRA can achieve performance comparable to full fine-tuning [<xref ref-type="bibr" rid="ref41">41</xref>].</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Overview of the 17 encoder and decoder language models compared in this study. All models were fine-tuned to extract clinically important concepts from Japanese case reports in the J-CaseMap database and evaluated on the E-CCR task. Decoder models (LLM-jp-13B v1.0, GPT-NeoX-3.6B, Swallow-7B/13B/70B, Swallow-MS-7B) were fine-tuned by instruction tuning or token classification, while encoder models (RoBERTa-base, DeBERTaV2-base, JMedRoBERTa, JMedDeBERTa(s), JMedDeBERTa(c)) were fine-tuned by token classification only. 
All models used identical data splits and evaluation metrics.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e78681_fig05.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Characteristics of the 17 encoder and decoder language models evaluated on the extended clinical concept recognition task in Japanese clinical case reports.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Models<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="bottom">Architecture</td><td align="left" valign="bottom">Parameters</td><td align="left" valign="bottom">Pretraining methods</td><td align="left" valign="bottom">Pretraining corpus type</td><td align="left" valign="bottom">Pretraining corpus size</td><td align="left" valign="bottom">Vocab size</td></tr></thead><tbody><tr><td align="left" valign="top">Decoder models</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-NeoX-3.6B</td><td align="left" valign="top">GPT-NeoX-3.6B</td><td align="left" valign="top">3.6B</td><td align="left" valign="top">From scratch</td><td align="left" valign="top">General (Japanese)</td><td align="left" valign="top">312.5B tokens</td><td align="left" valign="top">32,000</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLM-jp-13B v1.0</td><td align="left" valign="top">GPT2-13B</td><td align="left" valign="top">13B</td><td align="left" valign="top">From scratch</td><td align="left" valign="top">General (Japanese)</td><td align="left" valign="top">300B tokens</td><td align="left" 
valign="top">50,570</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Swallow-7B</td><td align="left" valign="top">LLaMA2-7B</td><td align="left" valign="top">7B</td><td align="left" valign="top">Continual learning</td><td align="left" valign="top">English+general (Japanese)</td><td align="left" valign="top">+100B tokens</td><td align="left" valign="top">43,176</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Swallow-13B</td><td align="left" valign="top">LLaMA2-13B</td><td align="left" valign="top">13B</td><td align="left" valign="top">Continual learning</td><td align="left" valign="top">English+general (Japanese)</td><td align="left" valign="top">+100B tokens</td><td align="left" valign="top">43,176</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Swallow-70B</td><td align="left" valign="top">LLaMA2-70B</td><td align="left" valign="top">70B</td><td align="left" valign="top">Continual learning</td><td align="left" valign="top">English+general (Japanese)</td><td align="left" valign="top">+100B tokens</td><td align="left" valign="top">43,176</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Swallow-MS-7B</td><td align="left" valign="top">Mistral-7B</td><td align="left" valign="top">7B</td><td align="left" valign="top">Continual learning</td><td align="left" valign="top">English+general (Japanese)</td><td align="left" valign="top">+105B tokens</td><td align="left" valign="top">42,800</td></tr><tr><td align="left" valign="top">Encoder models</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>DeBERTaV2-base</td><td align="left" valign="top">DeBERTa V2-base</td><td align="left" valign="top">110M</td><td align="left" valign="top">Continual learning</td><td align="left" valign="top">General+finance</td><td align="left" valign="top">357 GB+5.2 GB</td><td align="left" valign="top">32,000</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoBERTa-base</td><td align="left" valign="top">RoBERTa-base</td><td align="left" valign="top">110M</td><td align="left" valign="top">From scratch</td><td align="left" valign="top">General (Japanese)</td><td align="left" valign="top">75 GB</td><td align="left" valign="top">32,000</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>JMedRoBERTa</td><td align="left" valign="top">RoBERTa-base</td><td align="left" valign="top">124M</td><td align="left" valign="top">From scratch</td><td align="left" valign="top">Medical (Japanese)</td><td align="left" valign="top">1.8 GB</td><td align="left" valign="top">50,000</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>JMedDeBERTa(s)</td><td align="left" valign="top">DeBERTa V2-base</td><td align="left" valign="top">125M</td><td align="left" valign="top">From scratch</td><td align="left" valign="top">Medical (Japanese, this study)</td><td align="left" valign="top">0.56 GB</td><td align="left" valign="top">32,000</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>JMedDeBERTa(c)</td><td align="left" valign="top">DeBERTa V2- base</td><td align="left" valign="top">110M</td><td align="left" valign="top">Continual learning</td><td align="left" valign="top">General+medical (this study)</td><td 
align="left" valign="top">362 GB (general)+0.56 GB (medical)</td><td align="left" valign="top">32,000</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup> Models are grouped by architecture (decoder or encoder) and described by parameter count, pretraining methods, pretraining corpus type and size, and vocabulary size. All models were subsequently fine-tuned on the data set constructed from the J-CaseMap database.</p></fn></table-wrap-foot></table-wrap><p>For decoder-only models in the token classification setting, we retained the default causal (left-to-right) self-attention behavior used during pretraining; thus, the hidden state at position t could attend only to tokens at positions &#x2264;t. In implementation, we used only the standard padding attention mask to ignore padded tokens and did not modify the causal mask or the attention pattern (eg, no full unmasking, no layer-wise causal mask removal, and no other bidirectionality-enabling modification) [<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]. 
We attached a token-level classification head to the final-layer hidden states and computed label logits for all token positions in parallel from a single forward pass under this causal mask.</p></sec><sec id="s2-7-2"><title>Pretraining Methods</title><p>&#x201C;From scratch&#x201D; indicates that the base model was pretrained from random initialization on the corpus listed under &#x201C;pretraining corpus type/size.&#x201D; &#x201C;Continual learning&#x201D; indicates that the base model was initialized from an existing pretrained checkpoint and further pretrained on additional corpora (eg, Swallow models continued training on approximately 100 billion tokens of English plus general Japanese text following LLaMA2 or Mistral; JMedDeBERTa(c) continued training on 0.56 GB of medical text after DeBERTaV2-base).</p></sec><sec id="s2-7-3"><title>Pretraining Corpus Type</title><p>&#x201C;General&#x201D; indicates general-domain Japanese text; &#x201C;medical&#x201D; indicates the 0.56 GB of Japanese medical documents used in this study; &#x201C;general+finance&#x201D; indicates a mixture of general and financial text; and &#x201C;general+medical&#x201D; indicates the combination of the general corpus used for DeBERTaV2-base and the 0.56 GB medical corpus used for JMedDeBERTa(c).</p></sec></sec><sec id="s2-8"><title>External Evaluation on a Public Clinical NER Dataset</title><p>To assess the reproducibility and generalizability of our findings using a public benchmark, we conducted an additional evaluation on Medical Report Named Entity Recognition for positive disease (MRNER disease), a Japanese clinical NER dataset used in JmedBench [<xref ref-type="bibr" rid="ref44">44</xref>]. MRNER disease comprises 100 clinical documents (50 case reports and 50 radiology reports) and 810 annotated entities. 
The task focuses on extracting relatively short disease names actually observed in patients, corresponding to a conventional clinical NER setting rather than our E-CCR task, which targets longer, diagnostically important phrases.</p><p>Although the number of annotated entities is relatively small for training large models, we considered the dataset sufficient for benchmark evaluation. We applied the same preprocessing, fine-tuning procedures, and evaluation framework as in the main analysis. Specifically, all 17 models were fine-tuned on MRNER disease and evaluated using the weighted soft matching score with fragmentation penalty parameter <italic>p</italic>=1, 1.5, 2, 100. Detailed scores for each model (<italic>F</italic><sub>1</sub>-score, recall, and precision at each <italic>p</italic>) are reported in Tables S10-S17 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-9"><title>Ethical Considerations</title><p>In this study, to protect patient privacy, all personally identifiable information was removed from the datasets used. The case reports of the Japanese Society of Internal Medicine used to construct the J-CaseMap database are abstracts of case reports presented at regional meetings of the Society that do not contain personal information, and they have already been made publicly available on the web; therefore, no additional ethical review was required. In contrast, for the development of the language model using clinical record texts, to protect patient privacy, we used data from which all traceable personally identifiable information had been removed from the analysis dataset, and the study was conducted in accordance with the Declaration of Helsinki with the approval of the Ethics Committee of the Faculty of Medicine and Graduate School of Medicine, The University of Tokyo (approval 2018-NI). 
As this study analyzed only anonymized case report texts (secondary-use data) and involved no direct intervention with or interaction toward human subjects, the ethics committee determined that neither individual informed consent from patients nor financial compensation was required. Neither this article nor the supplementary materials contains any images or other materials that could be used to identify individual patients or clinicians.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p><xref ref-type="table" rid="table2">Table 2</xref> summarizes the average results from 5-fold cross-validation of the 17 comparison models. Tables S1-S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> display the average results for markers, characters, and tokens across all cases. Tables S4-S7 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> show average results for weighted soft matching scores, marker matching scores, characters, and tokens for each case. Model performance declined as the penalty for splits in extracted parts increased. When comparing the results for <italic>p</italic>=1 and <italic>p</italic>=100, the decoder model GPT-NeoX-3.6B was the most impacted, with an <italic>F</italic><sub>1</sub>-score reduction of &#x2212;0.047.</p><p>For other decoder models using token classification, the <italic>F</italic><sub>1</sub>-score ranged from &#x2212;0.042 to &#x2212;0.046. Encoder model <italic>F</italic><sub>1</sub>-scores ranged from &#x2212;0.032 to &#x2212;0.036. When using instruction tuning, the <italic>F</italic><sub>1</sub>-score for all decoder models ranged from &#x2212;0.022 to &#x2212;0.030.</p><p>Regardless of the penalty size, the encoder model using token classification, that is, JMedDeBERTa(s), achieved the highest mean scores, while the top encoder models showed very similar performance. 
The decoder model using instruction tuning, that is, GPT-NeoX-3.6B, showed the worst results. Among the decoder models, Swallow-70B, which exhibited the best results for instruction tuning and token classification, could not produce better results than the encoder models.</p><p>Next, we compared and verified the results for <italic>p</italic>=1, which performed the best. Among all the models, the encoder model using token classification, that is, JMedDeBERTa(s), achieved the highest performance: mean <italic>F</italic><sub>1</sub>-score 0.758 (SD 0.002), mean recall 0.768 (SD 0.011), and mean precision 0.749 (SD 0.009). In contrast, the decoder model GPT-NeoX-3.6B, under instruction tuning, recorded the lowest performance: mean <italic>F</italic><sub>1</sub>-score 0.621 (SD 0.023), mean recall 0.582 (SD 0.044), and mean precision 0.669 (SD 0.006). The encoder model JMedDeBERTa(s), with 125 million parameters, is approximately 1/560th the size of Swallow-70B with 70 billion parameters. The decoder model Swallow-70B, under instruction tuning, achieved mean <italic>F</italic><sub>1</sub>-score 0.713 (SD 0.016), mean recall 0.680 (SD 0.035), and mean precision 0.751 (SD 0.020). The decoder model using token classification, that is, Swallow-70B, achieved mean <italic>F</italic><sub>1</sub>-score 0.739 (SD 0.005), mean recall 0.744 (SD 0.013), and mean precision 0.734 (SD 0.003). 
The metric scores of JMedDeBERTa(s) were better than those of Swallow-70B by margins of <italic>F</italic><sub>1</sub>-score=+0.045 and recall=+0.088 for instruction tuning and <italic>F</italic><sub>1</sub>-score=+0.020 and recall=+0.024 for token classification.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Performance of 17 encoder and decoder models in terms of weighted soft matching score on the J-CaseMap test set.<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" rowspan="2">Models<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="bottom" colspan="3"><italic>p</italic>=1, mean (SD)</td><td align="left" valign="bottom" colspan="3"><italic>p</italic>=1.5, mean (SD)</td><td align="left" valign="bottom" colspan="3"><italic>p</italic>=2, mean (SD)</td><td align="left" valign="bottom" colspan="3"><italic>p</italic>=100, mean (SD)</td></tr><tr><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom">Precision</td></tr></thead><tbody><tr><td align="left" valign="top">Decoder models</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td 
align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Instruction tuning</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-Neox-3.6B</td><td align="left" valign="top">0.621<break/>(0.023)</td><td align="left" valign="top">0.582<break/>(0.044)</td><td align="left" valign="top">0.669<break/>(0.006)</td><td align="left" valign="top">0.605<break/>(0.022)</td><td align="left" valign="top">0.566<break/>(0.042)</td><td align="left" valign="top">0.651<break/>(0.005)</td><td align="left" valign="top">0.599<break/>(0.022)</td><td align="left" valign="top">0.560<break/>(0.042)</td><td align="left" valign="top">0.645<break/>(0.005)</td><td align="left" valign="top">0.591<break/>(0.022)</td><td align="left" valign="top">0.553<break/>(0.041)</td><td align="left" valign="top">0.637<break/>(0.005)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLM-jp-13B v1.0</td><td align="left" valign="top">0.673<break/>(0.011)</td><td align="left" valign="top">0.602<break/>(0.022)</td><td align="left" valign="top"><italic>0.763</italic><break/><italic>(0.007)</italic><sup><xref 
ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">0.660<break/>(0.011)</td><td align="left" valign="top">0.588<break/>(0.021)</td><td align="left" valign="top"><italic>0.753</italic><break/><italic>(0.008)</italic><sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">0.656<break/>(0.011)</td><td align="left" valign="top">0.584<break/>(0.021)</td><td align="left" valign="top"><italic>0.750</italic><break/><italic>(0.008)</italic><sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">0.651<break/>(0.011)</td><td align="left" valign="top">0.577<break/>(0.021)</td><td align="left" valign="top"><italic>0.746</italic><break/><italic>(0.008)</italic><sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Swallow-7B</td><td align="left" valign="top">0.697<break/>(0.009)</td><td align="left" valign="top">0.667<break/>(0.037)</td><td align="left" valign="top">0.732<break/>(0.027)</td><td align="left" valign="top">0.681<break/>(0.007)</td><td align="left" valign="top">0.652<break/>(0.037)</td><td align="left" valign="top">0.717<break/>(0.032)</td><td align="left" valign="top">0.676<break/>(0.006)</td><td align="left" valign="top">0.647<break/>(0.037)</td><td align="left" valign="top">0.712<break/>(0.034)</td><td align="left" valign="top">0.670<break/>(0.005)</td><td align="left" valign="top">0.640<break/>(0.037)</td><td align="left" valign="top">0.706<break/>(0.036)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Swallow-13B</td><td align="left" 
valign="top">0.706<break/>(0.010)</td><td align="left" valign="top">0.668<break/>(0.024)</td><td align="left" valign="top">0.749<break/>(0.016)</td><td align="left" valign="top">0.692<break/>(0.011)</td><td align="left" valign="top">0.652<break/>(0.024)</td><td align="left" valign="top">0.739<break/>(0.019)</td><td align="left" valign="top">0.688<break/>(0.011)</td><td align="left" valign="top">0.647<break/>(0.023)</td><td align="left" valign="top">0.735<break/>(0.020)</td><td align="left" valign="top">0.682<break/>(0.011)</td><td align="left" valign="top">0.640<break/>(0.023)</td><td align="left" valign="top">0.731<break/>(0.021)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Swallow-70B</td><td align="left" valign="top">0.713<break/>(0.016)</td><td align="left" valign="top">0.680<break/>(0.035)</td><td align="left" valign="top">0.751<break/>(0.020)</td><td align="left" valign="top">0.699<break/>(0.015)</td><td align="left" valign="top">0.665<break/>(0.034)</td><td align="left" valign="top">0.738<break/>(0.023)</td><td align="left" valign="top">0.694<break/>(0.015)</td><td align="left" valign="top">0.660<break/>(0.034)</td><td align="left" valign="top">0.734<break/>(0.024)</td><td align="left" valign="top">0.688<break/>(0.015)</td><td align="left" valign="top">0.653<break/>(0.034)</td><td align="left" valign="top">0.730<break/>(0.025)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Swallow-MS-7B</td><td align="left" valign="top">0.693<break/>(0.018)</td><td align="left" valign="top">0.648<break/>(0.046)</td><td align="left" valign="top">0.747<break/>(0.022)</td><td align="left" valign="top">0.679<break/>(0.017)</td><td align="left" 
valign="top">0.633<break/>(0.044)</td><td align="left" valign="top">0.735<break/>(0.024)</td><td align="left" valign="top">0.675<break/>(0.017)</td><td align="left" valign="top">0.628<break/>(0.043)</td><td align="left" valign="top">0.732<break/>(0.025)</td><td align="left" valign="top">0.669<break/>(0.017)</td><td align="left" valign="top">0.621<break/>(0.042)</td><td align="left" valign="top">0.727<break/>(0.026)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Token classification</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPTNeox-3.6B</td><td align="left" valign="top">0.725<break/>(0.004)</td><td align="left" valign="top">0.731<break/>(0.014)</td><td align="left" valign="top">0.719<break/>(0.007)</td><td align="left" valign="top">0.698<break/>(0.005)</td><td align="left" valign="top">0.683<break/>(0.014)</td><td align="left" valign="top">0.714<break/>(0.007)</td><td align="left" valign="top">0.689<break/>(0.005)</td><td align="left" valign="top">0.668<break/>(0.014)</td><td align="left" valign="top">0.713<break/>(0.008)</td><td align="left" valign="top">0.678<break/>(0.005)</td><td align="left" valign="top">0.649<break/>(0.014)</td><td align="left" valign="top">0.710<break/>(0.008)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLM-jp-13B v1.0</td><td align="left" valign="top">0.732<break/>(0.003)</td><td align="left" valign="top">0.742<break/>(0.017)</td><td align="left" valign="top">0.722<break/>(0.010)</td><td align="left" valign="top">0.707<break/>(0.004)</td><td align="left" valign="top">0.696<break/>(0.017)</td><td align="left" valign="top">0.718<break/>(0.010)</td><td align="left" valign="top">0.698<break/>(0.004)</td><td align="left" valign="top">0.681<break/>(0.016)</td><td align="left" valign="top">0.716<break/>(0.010)</td><td align="left" valign="top">0.688<break/>(0.004)</td><td align="left" valign="top">0.663<break/>(0.016)</td><td align="left" valign="top">0.714<break/>(0.010)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Swallow-7B</td><td align="left" valign="top">0.732<break/>(0.003)</td><td align="left" valign="top">0.733<break/>(0.010)</td><td align="left" valign="top">0.731<break/>(0.008)</td><td align="left" valign="top">0.706<break/>(0.004)</td><td align="left" valign="top">0.686<break/>(0.012)</td><td align="left" valign="top">0.727<break/>(0.009)</td><td align="left" valign="top">0.697<break/>(0.004)</td><td align="left" valign="top">0.671<break/>(0.013)</td><td align="left" valign="top">0.725<break/>(0.009)</td><td align="left" valign="top">0.686<break/>(0.005)</td><td align="left" valign="top">0.653<break/>(0.013)</td><td align="left" valign="top">0.723<break/>(0.009)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Swallow-13B</td><td align="left" valign="top">0.735<break/>(0.002)</td><td align="left" valign="top">0.739<break/>(0.013)</td><td align="left" 
valign="top">0.730<break/>(0.008)</td><td align="left" valign="top">0.710<break/>(0.003)</td><td align="left" valign="top">0.696<break/>(0.012)</td><td align="left" valign="top">0.725<break/>(0.008)</td><td align="left" valign="top">0.702<break/>(0.003)</td><td align="left" valign="top">0.683<break/>(0.012)</td><td align="left" valign="top">0.724<break/>(0.008)</td><td align="left" valign="top">0.692<break/>(0.003)</td><td align="left" valign="top">0.666<break/>(0.012)</td><td align="left" valign="top">0.721<break/>(0.009)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Swallow-70B</td><td align="left" valign="top">0.739<break/>(0.005)</td><td align="left" valign="top">0.744<break/>(0.013)</td><td align="left" valign="top">0.734<break/>(0.003)</td><td align="left" valign="top">0.714<break/>(0.006)</td><td align="left" valign="top">0.698<break/>(0.014)</td><td align="left" valign="top">0.730<break/>(0.003)</td><td align="left" valign="top">0.706<break/>(0.007)</td><td align="left" valign="top">0.684<break/>(0.015)</td><td align="left" valign="top">0.729<break/>(0.003)</td><td align="left" valign="top">0.695<break/>(0.007)</td><td align="left" valign="top">0.666<break/>(0.015)</td><td align="left" valign="top">0.727<break/>(0.003)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Swallow-MS-7B</td><td align="left" valign="top">0.736<break/>(0.002)</td><td align="left" valign="top">0.755<break/>(0.010)</td><td align="left" valign="top">0.719<break/>(0.008)</td><td align="left" valign="top">0.711<break/>(0.002)</td><td align="left" valign="top">0.709<break/>(0.009)</td><td align="left" valign="top">0.714<break/>(0.008)</td><td align="left" 
valign="top">0.703<break/>(0.002)</td><td align="left" valign="top">0.694<break/>(0.009)</td><td align="left" valign="top">0.712<break/>(0.008)</td><td align="left" valign="top">0.692<break/>(0.002)</td><td align="left" valign="top">0.676<break/>(0.009)</td><td align="left" valign="top">0.709<break/>(0.008)</td></tr><tr><td align="left" valign="top">Encoder models</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Token classification</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>DeBERTaV2-base</td><td align="left" valign="top">0.755<break/>(0.006)</td><td align="left" valign="top">0.761<break/>(0.015)</td><td align="left" valign="top">0.750<break/>(0.006)</td><td align="left" valign="top">0.738<break/>(0.007)</td><td align="left" valign="top">0.728<break/>(0.016)</td><td align="left" valign="top">0.747<break/>(0.006)</td><td align="left" valign="top">0.732<break/>(0.007)</td><td align="left" valign="top">0.718<break/>(0.016)</td><td align="left" valign="top">0.746<break/>(0.006)</td><td 
align="left" valign="top">0.724<break/>(0.008)</td><td align="left" valign="top">0.704<break/>(0.017)</td><td align="left" valign="top">0.744<break/>(0.006)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoBERTa-base</td><td align="left" valign="top">0.748<break/>(0.003)</td><td align="left" valign="top">0.748<break/>(0.013)</td><td align="left" valign="top">0.749<break/>(0.009)</td><td align="left" valign="top">0.729<break/>(0.003)</td><td align="left" valign="top">0.713<break/>(0.012)</td><td align="left" valign="top">0.746<break/>(0.009)</td><td align="left" valign="top">0.722<break/>(0.003)</td><td align="left" valign="top">0.701<break/>(0.012)</td><td align="left" valign="top">0.745<break/>(0.009)</td><td align="left" valign="top">0.714<break/>(0.003)</td><td align="left" valign="top">0.687<break/>(0.011)</td><td align="left" valign="top">0.743<break/>(0.009)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>JMedRoBERTa</td><td align="left" valign="top">0.756<break/>(0.003)</td><td align="left" valign="top"><italic>0.769</italic><break/><italic>(0.015)</italic><sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">0.743<break/>(0.009)</td><td align="left" valign="top">0.735<break/>(0.003)</td><td align="left" valign="top">0.733<break/>(0.015)</td><td align="left" valign="top">0.737<break/>(0.010)</td><td align="left" valign="top">0.728<break/>(0.003)</td><td align="left" valign="top">0.722<break/>(0.016)</td><td align="left" valign="top">0.735<break/>(0.010)</td><td align="left" valign="top">0.719<break/>(0.004)</td><td align="left" valign="top">0.707<break/>(0.016)</td><td align="left" 
valign="top">0.732<break/>(0.011)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>JMedDeBERTa(s)</td><td align="left" valign="top"><italic>0.758</italic><break/><italic>(0.002)</italic><sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">0.768<break/>(0.011)</td><td align="left" valign="top">0.749<break/>(0.009)</td><td align="left" valign="top"><italic>0.740</italic><break/><italic>(0.002)</italic><sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top"><italic>0.734</italic><break/><italic>(0.012)</italic><sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">0.746<break/>(0.010)</td><td align="left" valign="top"><italic>0.734</italic><break/><italic>(0.002)</italic><sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">0.724<break/>(0.012)<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">0.744<break/>(0.010)</td><td align="left" valign="top"><italic>0.726</italic><break/><italic>(0.002)</italic><sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top"><italic>0.710</italic><break/><italic>(0.012)</italic><sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">0.743<break/>(0.010)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>JMedDeBERTa(c)</td><td align="left" valign="top">0.757<break/>(0.004)</td><td align="left" valign="top">0.764<break/>(0.014)</td><td align="left" 
valign="top">0.751<break/>(0.010)</td><td align="left" valign="top">0.739<break/>(0.004)</td><td align="left" valign="top">0.730<break/>(0.014)</td><td align="left" valign="top">0.748<break/>(0.010)</td><td align="left" valign="top">0.732<break/>(0.004)</td><td align="left" valign="top">0.719<break/>(0.014)</td><td align="left" valign="top">0.747<break/>(0.010)</td><td align="left" valign="top">0.724<break/>(0.004)</td><td align="left" valign="top">0.705<break/>(0.014)</td><td align="left" valign="top">0.746<break/>(0.010)<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup><italic>F</italic><sub>1</sub>-score, recall, and precision are shown for 4 conditions of the H&#x00F6;lder-type parameter <italic>p</italic> (1, 1.5, 2, 100), which controls the strength of the fragmentation penalty (<italic>p</italic>=1 corresponds to pure character-level evaluation; larger <italic>p</italic> penalizes split predictions of long target spans more heavily).</p></fn><fn id="table2fn2"><p><sup>b</sup>Model performance is calculated using the weighted soft matching score, evaluating all extraction targets (clinical concepts) in the test set collectively.</p></fn><fn id="table2fn3"><p><sup>c</sup>Highest score among the comparative models.</p></fn><fn id="table2fn4"><p><sup>d</sup>Highest performance.</p></fn></table-wrap-foot></table-wrap><p>Next, we compared the performance of the decoder models. In the instruction tuning task, Swallow-70B achieved the highest mean <italic>F</italic><sub>1</sub>-score 0.713 (SD 0.016) and mean recall 0.680 (SD 0.035), whereas GPT-NeoX-3.6B achieved the lowest mean <italic>F</italic><sub>1</sub>-score 0.621 (SD 0.023) and mean recall 0.582 (SD 0.044). 
In the token classification task, Swallow-70B achieved the highest mean <italic>F</italic><sub>1</sub>-score 0.739 (SD 0.005), and Swallow-MS-7B achieved the highest mean recall 0.755 (SD 0.010), whereas GPT-NeoX-3.6B achieved the lowest mean <italic>F</italic><sub>1</sub>-score 0.725 (SD 0.004) and mean recall 0.731 (SD 0.014). In the Llama 2-based decoder models, increasing the parameter count from 7 billion to 70 billion enhanced performance: <italic>F</italic><sub>1</sub>-score +0.016 and recall +0.013 for the instruction tuning task, and <italic>F</italic><sub>1</sub>-score +0.007 and recall +0.010 for the token classification task.</p><p>Finally, we compared the various encoder models. JMedDeBERTa(s) achieved the numerically highest mean <italic>F</italic><sub>1</sub>-score (0.758, SD 0.002), and JMedRoBERTa achieved the highest mean recall (0.769, SD 0.015), whereas RoBERTa-base showed the lowest performance (mean <italic>F</italic><sub>1</sub>-score 0.748, SD 0.003; and mean recall 0.748, SD 0.013). Notably, among encoder models, the DeBERTa family models (JMedDeBERTa(s), JMedDeBERTa(c), and DeBERTaV2-base) achieved the highest mean scores. However, differences within this top tier were marginal (&#x0394;<italic>F</italic><sub>1</sub>&#x2264;0.003 at <italic>p</italic>=1) and should be interpreted as effectively comparable, given the estimated annotation uncertainty (4% case-level corrections in our sampled review). DeBERTaV2-base achieved a higher <italic>F</italic><sub>1</sub>-score (+0.007) than RoBERTa-base, and JMedDeBERTa(s) achieved a higher <italic>F</italic><sub>1</sub>-score (+0.003) than JMedRoBERTa. 
JMedRoBERTa achieved a higher <italic>F</italic><sub>1</sub>-score (+0.007) than RoBERTa-base.</p></sec><sec id="s3-2"><title>Performance on a Public Clinical NER Benchmark</title><p>Consistent with the E-CCR results on J-CaseMap, encoder models outperformed decoder models on the MRNER disease benchmark (Table S10 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The best-performing encoder model was DeBERTaV2-base, which achieved a mean <italic>F</italic><sub>1</sub>-score of 0.794 (SD 0.018) at <italic>p</italic>=1, closely followed by JMedDeBERTa(c) (mean <italic>F</italic><sub>1</sub>-score 0.788, SD 0.016) and JMedDeBERTa(s) (mean <italic>F</italic><sub>1</sub>-score 0.788, SD 0.027). Among decoder models, LLM-jp-13B v1.0 in the token classification setting showed the highest performance (mean <italic>F</italic><sub>1</sub>-score 0.694, SD 0.029), whereas instruction-tuned GPT-NeoX-3.6B performed worst (mean <italic>F</italic><sub>1</sub>-score 0.373, SD 0.067). These results reinforce the main conclusion that, for Japanese clinical concept extraction, encoder-based token classification is consistently stronger than decoder-based approaches&#x2014;whether instruction-tuned or token classification models&#x2014;under the same span-level evaluation.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>A key finding of this study is that, under the experimental conditions, encoder models consistently outperformed decoder models; no decoder model outperformed any encoder model.</p><p>On the J-CaseMap dataset, JMedDeBERTa(s)&#x2014;an encoder model pretrained on domain-specific medical text&#x2014;achieved the highest performance (<italic>F</italic><sub>1</sub>-score=0.758, SD 0.002), whereas the instruction-tuned GPT-NeoX-3.6B produced the worst. 
While performance generally declined as the fragmentation penalty increased, no consistently severe degradation was observed. On the MRNER disease dataset, the general-domain DeBERTaV2-base yielded the highest mean <italic>F</italic><sub>1</sub>-score. The differences among the medical-domain JMedDeBERTa(s) and JMedDeBERTa(c) variants were minimal, suggesting that the benefit of domain-specific pretraining was limited in this context. Overall, under the evaluated instruction tuning setup (a single prompt template requiring inline @@/## boundary tags), token classification encoders were more effective than instruction-tuned decoders in our experiments. However, because the generative setting additionally required faithful text reproduction plus precise character-level boundary-tag placement, the observed performance gap may partly reflect the difficulty of this particular output representation rather than model architecture alone. We did not evaluate alternative prompting strategies (eg, structured span lists or JSON outputs) or reasoning-augmented prompting (eg, chain-of-thought), which could affect generative performance. Importantly, encoder-based token classification is nonautoregressive and can assign labels to all tokens in parallel (ie, in a single forward pass), whereas decoder-based extraction via instruction tuning typically relies on autoregressive generation, whose runtime scales with the number of generated tokens. This architectural difference is expected to yield lower and more predictable inference latency and higher throughput for encoder-based token classification, which is particularly relevant for time-sensitive clinical workflows. Although we did not benchmark wall-clock inference latency in this study, we highlight this implication as a practical advantage of encoder-based approaches. 
Moreover, the smaller memory footprint of encoder models can make on-premise deployment more feasible when handling sensitive clinical text.</p><p>If, due to hallucination, the decoder extracts a string that differs from the original only in trivial ways&#x2014;such as the insertion or deletion of a very small number of characters&#x2014;then counting such negligible discrepancies as correct could, in principle, yield higher practical accuracy. However, as shown in Table S18 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, after removing @@/## from both the gold-set inputs and the model outputs, the 5-fold mean exact match rate was 98.40% (SD 0.42%). Moreover, inspection of the mismatched cases indicated that most differences were minor: considering mismatches only, the mean SequenceMatcher similarity was 0.993 (SD 0.002), the mean Levenshtein distance was 7.54 (SD 1.99), and the mean length difference was &#x2212;2.30 (SD 1.82). These findings suggest that even under a lenient assumption in which all such cases are treated as correct, the conclusion stated earlier would not change.</p><p>In recent years, the development of biomedical language models prioritized increasingly complex architectures and a rapid, often exponential growth in parameter count, in line with the consensus in LLM research [<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref46">46</xref>]. However, some studies have reported that even a substantial increase in the parameter count of the decoder models does not necessarily improve model performance beyond that of encoder models [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>], which is consistent with the findings of this study.</p><p>One possible explanation is that token classification constitutes a more constrained formulation of the prediction problem than free-form text generation via instruction tuning. 
Specifically, instead of having to localize the target span while generating fluent text, the model only needs to assign a binary label to each token. As the output space is thus more restricted, encoder models with far fewer parameters may plausibly compete with much larger decoder models. At the very least, a similar tendency was observed in both our main experiments and the additional experiments on the MRNER disease dataset. In our study, we demonstrated that even for tasks beyond conventional NER, such as E-CCR that targets longer phrases, encoder models with far fewer parameters are far more effective than decoder models. These findings offer valuable insight into model selection under practical constraints for general research institutions with limited computational resources, representing one of the substantial contributions of our work. Prior research has reported that decoder models require substantial computational resources and incur high costs, making them unsuitable when such resources are constrained [<xref ref-type="bibr" rid="ref2">2</xref>].</p><p>Our goal was to compare off-the-shelf encoder versus decoder backbones under commonly used fine-tuning recipes, without altering the pretrained attention mechanism. An additional factor contributing to the encoder-decoder gap is attention directionality. Encoder models compute token representations bidirectionally, whereas decoder-only models under the default causal mask cannot incorporate future tokens when predicting token-level labels. This limitation may particularly affect boundary decisions in span labeling. Prior work indicates that partially or fully relaxing the causal mask (eg, layer-wise causal mask removal) can markedly improve sequence labeling with decoder-only LLM [<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]. 
Under the standard fine-tuning recipes considered in this study, encoder backbones outperform decoder-only backbones on token classification; however, relaxing the causal mask (eg, via layer-wise mask removal) may improve decoder-only models and help narrow this gap, which we leave for future work.</p><p>Second, a comparison of the evaluated decoder models showed that instruction-tuned variants consistently underperformed their token classification counterparts. This contrasts with prior work [<xref ref-type="bibr" rid="ref28">28</xref>], which suggested that decoder models can match or even surpass encoder models when supported by carefully designed prompts. Our results indicate that, even within decoder architectures, token classification is more effective than instruction tuning for extraction tasks. Although we did not exhaustively explore all prompt designs, the E-CCR task requires extracting entire phrases corresponding to single clinical concepts. This is substantially more complex than conventional NER, making effective prompt design for instruction tuning considerably more challenging. Overall, these findings suggest that token classification provides a more robust approach for specialized extraction scenarios across both encoder and decoder models.</p><p>Among the various token classification&#x2013;based encoder models investigated in this study, JMedDeBERTa(s) achieved the highest mean performance, whereas RoBERTa-base performed the worst. However, the differences among the top encoder models were very small and should be interpreted as comparable within the uncertainty implied by our annotation review. This is consistent with previous research [<xref ref-type="bibr" rid="ref34">34</xref>] and indicates that the DeBERTa architecture is more effective than the RoBERTa architecture. 
In particular, DeBERTa uses the disentangled attention mechanism that allows for a more precise interpretation of the importance of word positions and the contextual relationships among words, as well as a masked decoder that predicts not only the masked tokens but also their positions [<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref48">48</xref>]. These mechanisms likely enhance the understanding of linguistic structure and context, resulting in high performance even on E-CCR tasks.</p><p>With respect to the effect of domain-specific text versus general text, no significant difference was detected; however, the highest accuracy was achieved by the model pretrained on domain-specific documents, followed by the model that underwent additional pretraining on a generally pretrained model, and finally by the model pretrained solely on general text. For example, among the DeBERTa models, JMedDeBERTa(s)&#x2014;pretrained on medical documents&#x2014;outperformed both DeBERTaV2-base (pretrained on general Japanese texts) and JMedDeBERTa(c), which was additionally pretrained on medical records. Although JMedDeBERTa(c) performed better than DeBERTaV2-base, it did not reach the performance level of JMedDeBERTa(s). Similarly, among the RoBERTa models, JMedRoBERTa, which was pretrained on medical documents, outperformed RoBERTa-base, which was pretrained on general Japanese texts. However, the differences were minimal, indicating that the distinctions among these pretraining approaches are negligible.</p></sec><sec id="s4-2"><title>Comparison to Prior Work</title><p>This observation aligns with previous studies. Kim et al [<xref ref-type="bibr" rid="ref49">49</xref>] reported that a model pretrained on domain-specific Korean medical documents outperformed a general Korean language model. 
Conversely, Subies et al [<xref ref-type="bibr" rid="ref33">33</xref>] found that a model pretrained on domain-specific Spanish medical records did not surpass the performance of a model pretrained on general Spanish texts or that of a multilingual model. As with our findings, the differences in model comparison results in these studies were minimal.</p><p>Similar to the minimal performance differences reported by Kim et al [<xref ref-type="bibr" rid="ref49">49</xref>] and Subies et al [<xref ref-type="bibr" rid="ref33">33</xref>], the slight differences in performance observed in our study may be attributed to the quantity and content of the texts used for both pretraining and additional pretraining. Specifically, only 0.56 GB of medical text was used to train the domain-specific models JMedDeBERTa(c) and JMedDeBERTa(s), compared to the 362 GB of text used to train the general-corpus DeBERTaV2-base model.</p><p>Although the pretraining corpus is small compared with typical general-language pretraining datasets, the MLM loss curves for JMedDeBERTa(s) and JMedDeBERTa(c) (Figures S1 and S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) indicate stable optimization and convergence under our setting. We note, however, that convergence of the MLM loss does not by itself guarantee broad linguistic coverage or robustness across institutions and document types. A possible reason why effective learning may be achievable with a relatively small corpus is the qualitative nature of discharge summaries: compared with fragmented note types (eg, brief progress notes), discharge summaries are often more narrative and syntactically coherent, potentially enabling MLM pretraining to capture clinically salient vocabulary and patterns efficiently. 
Our findings indicate that, at least for the E-CCR task examined in this study, even a model pretrained on a very small domain-specific corpus can achieve performance comparable to, or slightly exceeding, that of a model trained on a large general corpus. This highlights the high data efficiency of domain-specific pretraining in this setting: even a small amount of domain-specific text may be sufficient to obtain performance on par with a model trained on a much larger general corpus. In contrast, our results also indicate that, even when only a limited amount of domain-specific text is available, a model trained solely on general text may still achieve satisfactory performance on the E-CCR task. This is presumably because, in the E-CCR task, the syntactic characteristics of the text play a crucial role even when specialized terminology has not been thoroughly acquired. For example, even if the model has not explicitly learned each specialized term, when processing a sentence such as &#x201C;SAA was abnormally elevated to 1940 mg/mL, and FMF was suspected,&#x201D; it may still be able to extract spans such as &#x201C;SAA was abnormally elevated to 1940 mg/mL&#x201D; and &#x201C;FMF&#x201D; by using syntactic information as clues. Therefore, training strategies should be flexibly adjusted according to the amount and type of data available.</p></sec><sec id="s4-3"><title>External Benchmark and Implications for Domain-Specific Versus General Pretraining</title><p>Additional experiments using the MRNER disease dataset further support these conclusions. Even in this conventional clinical NER task, encoder models consistently outperformed decoder models; however, the general-domain DeBERTaV2-base slightly outperformed the domain-specific JMedDeBERTa(s) (eg, <italic>F</italic><sub>1</sub>=0.794, SD 0.018 vs 0.788, SD 0.027 at <italic>p</italic>=1). 
However, this difference was marginal and likely reflects multiple factors.</p><p>We interpret this result not as a lack of generalization ability in JMedDeBERTa(s) but primarily as a reflection of task mismatch and distribution shift. First, MRNER disease is a conventional clinical NER task targeting short disease name entities, whereas E-CCR, our primary evaluation target, focuses on longer, more complex, and clinically meaningful spans that often exceed typical NER boundaries. Therefore, the relative advantage of domain-specific versus general-domain pretraining can vary depending on specific task requirements. Additionally, distribution shifts arising from differences in document types, medical institutions, and annotation criteria may have also influenced these results.</p><p>The substantial disparity in the size and composition of pretraining data (362 GB vs 0.56 GB) further complicates this comparison. Although JMedDeBERTa(s) achieved the highest average performance on the E-CCR task, the differences among the top-tier encoder models were generally comparable within the margin of uncertainty suggested by our annotation review. Taken together, these findings suggest that while pretraining on a massive general-domain corpus provides a solid linguistic foundation, pretraining on a much smaller domain-specific corpus can achieve competitive performance&#x2014;comparable to or slightly exceeding general-domain models&#x2014;depending on the task constraints and linguistic characteristics of the target documents.</p><p>However, as the external benchmark consists of only 100 documents, these slight performance differences are not necessarily robust and should be interpreted with caution. Moreover, previous research has reported substantially low zero-shot and few-shot performance of pretrained LLMs on MRNER disease (entity-level <italic>F</italic><sub>1</sub> &#x2264;0.3). 
In contrast, the significantly higher scores achieved in this study highlight the critical importance of supervised fine-tuning in Japanese clinical NER, demonstrating that merely prompting non&#x2013;fine-tuned LLMs is insufficient for achieving adequate performance in this context.</p></sec><sec id="s4-4"><title>Differences in Characteristics Between Instruction Tuning and Token Classification</title><p>The differences in the properties of instruction tuning and token classification were analyzed from the perspective of the length of the extracted segments. Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> presents evaluation results using the marker matching score, which does not account for the length of the extraction targets. In contrast to <xref ref-type="table" rid="table2">Table 2</xref>, where longer extracted segments yield higher scores, Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> simply counts each extraction as one regardless of its length. Consequently, frequently occurring short terms (eg, &#x201C;lung cancer&#x201D;) that are repeatedly extracted may appear to achieve a high score.</p><p>Comparing <xref ref-type="table" rid="table2">Table 2</xref> and Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> reveals that, for instruction tuning, all models except GPT-NeoX-3.6B attained higher <italic>F</italic><sub>1</sub>-scores in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>In addition, we stratified the gold entities into 5 character-length bins (1&#x2010;2, 3&#x2010;6, 7&#x2010;10, 11&#x2010;20, and &#x2265;21) and computed precision, recall, and <italic>F</italic><sub>1</sub>-score within each bin (Tables S19-S21 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). 
Under the weighted soft matching score with <italic>p</italic>=1 (equivalent to character-level evaluation), the decoder model (Swallow-70B) achieved the highest <italic>F</italic><sub>1</sub>-score for the shortest spans (1&#x2010;2 characters; <italic>F</italic><sub>1</sub>-score=0.814, SD 0.017), but its performance dropped rapidly as the span length increased (&#x2265;21 characters; <italic>F</italic><sub>1</sub>-score=0.677, SD 0.015). In contrast, the encoder model (JMedDeBERTa(s)) showed relatively low <italic>F</italic><sub>1</sub>-score for the extremely short spans (1&#x2010;2 characters), which account for only 9% of the dataset, while consistently outperforming the decoder model for spans of 3 characters or longer and maintaining robust performance even for long spans (&#x2265;10 characters). A similar trend was also observed in the stratified analysis based on the marker matching score, which does not weight span length.</p><p>This suggests that instruction tuning is effective for extracting short terms but less so for long expressions. When using instruction tuning, both sides of the extraction segments must be enclosed with markers. For E-CCR, which includes long phrases, the attention mechanism must retain these extended, marker-enclosed segments. This requirement may account for the discrepancy with a previous study [<xref ref-type="bibr" rid="ref28">28</xref>] that reported superior performance for instruction tuning compared with encoder models when extracting terms or short phrases (eg, disease names or gene identifiers). 
The fact that encoder models outperformed decoder models for spans of 3 characters or longer&#x2014;which constitute the majority (91%) of entities in E-CCR&#x2014;is also clinically meaningful.</p><p>A comparison of decoder models using token classification reveals higher <italic>F</italic><sub>1</sub>-scores in <xref ref-type="table" rid="table2">Table 2</xref> than in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. This suggests that token classification remains effective for extracting long phrases. When using token classification, the model assigns a 0 or 1 to each token, and this token-level information can be used in subsequent outputs, potentially making it easier to handle long markers than instruction tuning.</p><p>When long expressions are extracted, unintended portions of the text may also be included, potentially reducing precision. However, in applications such as automated knowledge graph construction, such noise is not critical, as it can be filtered out during subsequent processing (eg, normalization). The key point is that the relatively long expressions that need to be extracted are reliably captured. Token classification achieved a higher recall than instruction tuning and is therefore well suited to applications where E-CCR is followed by a series of postprocessing steps, such as normalization of the extracted segments and causal inference among terms, as in automatic knowledge graph construction.</p><p>Seventeen models were compared using the weighted soft matching score, which accounts for penalties when the extracted segments are fragmented. In particular, when <italic>p</italic>=1, the evaluation is equivalent to that based on character-level segmentation. Across all <italic>p</italic> values, token classification consistently outperformed instruction tuning, and encoder models outperformed decoder models. 
The scores did not decrease substantially at higher <italic>p</italic>, suggesting that cases in which the marker positions become fragmented during extraction are rare.</p><p><xref ref-type="fig" rid="figure6">Figure 6</xref> shows an example of the output results (for the test set) of JMedDeBERTa(s), which exhibited the best performance. The output results include (1) cases in which the extraction of critical clinical concepts was performed correctly and (2) cases in which the extraction of critical clinical concepts failed. In particular, in (2), as shown in <xref ref-type="fig" rid="figure6">Figure 6</xref>, cases exist where the model does not extract the part that should be extracted; however, cases also exist where the output results are not necessarily wrong, as shown in Figures S3-S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, and cases exist where the <italic>F</italic><sub>1</sub>-score decreases because of labeling-related problems. Therefore, the actual model performance may be underestimated and warrants further investigation in future research.</p><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Example of perfect extraction by JMedDeBERTa(s) in the test set. The upper row shows the original Japanese case report text, and the lower row shows the corresponding English translation. Blue highlights indicate gold-standard clinically essential expressions, and red highlights indicate spans predicted by JMedDeBERTa(s) fine-tuned with token classification. 
In this example, the predicted spans exactly match the gold standard, resulting in a weighted soft matching <italic>F</italic><sub>1</sub>-score of 1.0.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e78681_fig06.png"/></fig><p>To gauge the potential impact of such labeling issues on performance evaluation, we randomly sampled 200 cases from the test set and reassessed the correspondence between the gold-standard labels and the original case report texts. As a result, we found that in 8 of the 200 cases (4%), at least one gold-standard label required correction; in most of these cases, the problem was attributable not to completely inappropriate concepts, but rather to omissions of important expressions that should have been included or to deviations in span boundaries. Thus, although the overall quality of the gold standard appears to be generally high, the actual model performance may be slightly underestimated, and a more systematic investigation of this issue will be needed in future work. These findings suggest that the measured <italic>F</italic><sub>1</sub>-score may be partially bounded by residual label noise and annotation subjectivity, potentially constituting a performance ceiling in agreement with the reference standard.</p></sec><sec id="s4-5"><title>Novelty of This Study</title><p>This study is the first to compare instruction tuning and token classification, as well as decoder and encoder models, in the context of the comprehensive E-CCR task.</p><p>In clinical concept extraction, capturing both long and conventional short expressions is essential. This study extended conventional NER, which typically targets words or short phrases, by proposing an E-CCR task that includes long expressions as extraction targets together with a weighted soft matching score. This extension represents a significant contribution of this work. 
To the best of our knowledge, few studies have focused on extracting long expressions to the extent achieved in this study. The study by Chen et al [<xref ref-type="bibr" rid="ref15">15</xref>] addressed long expression extraction by summarizing discharge summaries through segment identification from the texts using AlphaBERT. Although Chen et al focused on extracting expressions representing single clinical concepts with AlphaBERT, their evaluation was limited to character-level analysis and did not incorporate metrics penalizing fragmented extraction. In contrast, the E-CCR framework explicitly includes long expressions as extraction targets. Chen et al.&#x2019;s study was limited to comparative evaluations among AlphaBERT, BERT, BioBERT, and LSTM models, without incorporating analyses using the rapidly advancing decoder models. The inclusion of comparative evaluations using various decoder models alongside encoder models constitutes a significant advancement.</p></sec><sec id="s4-6"><title>Future Directions</title><p>E-CCR can be applied not only to knowledge extraction for diagnostic support but also to other extraction tasks involving relatively long expressions. For example, consider a case in which information on daily living functions is extracted from a text and coded using the International Classification of Functioning, Disability and Health. 
When making a judgment of &#x201C;&#x56A5;&#x4E0B;&#x6A5F;&#x80FD;&#x4F4E;&#x4E0B; (&#x4E2D;&#x7B49;&#x5EA6;)&#x201D;&#x003C;b5105.2&#x003E; (&#x201C;swallowing function decline (moderate)&#x201D;&#x003C;b5105.2&#x003E;) in response to the expression &#x201C;&#x56A5;&#x4E0B;&#x9020;&#x5F71;&#x691C;&#x67FB;:&#x53CD;&#x5C04;&#x60F9;&#x8D77;&#x9045;&#x5EF6;(+)&#x3002;&#x6DB2;&#x4F53;&#x3067;&#x5C11;&#x91CF;&#x306E;&#x8AA4;&#x56A5;&#x3042;&#x308A;, &#x30DA;&#x30FC;&#x30B9;&#x30C8;&#x98DF;&#x3067;&#x5C11;&#x91CF;&#x306E;&#x6B8B;&#x7559;&#x3042;&#x308A;&#x3002;&#x54B3;&#x53CD;&#x5C04;&#x3042;&#x308B;&#x304C;&#x5FAE;&#x5F31;&#x201D; (&#x201C;swallowing contrast study: reflex-induced delay (+). Small amount of aspiration with liquid, small amount of residual with paste food. Cough reflex present but weak&#x201D;), the part corresponding to b5105.2 must be extracted, which is not a single word but a single description that spans multiple sentences. Additionally, numerous other applications can be envisioned, such as International Statistical Classification of Diseases and Related Health Problems coding of patient condition descriptions in clinical text, thereby providing valuable insights for extraction tasks across a wide range of medical texts.</p></sec><sec id="s4-7"><title>Limitations</title><p>First, all experiments were conducted on Japanese clinical case reports. 
As Japanese differs substantially from English and other languages that dominate clinical natural language processing research&#x2014;most notably in the lack of whitespace and the use of mixed scripts&#x2014;our findings may not directly generalize to other languages or document types.</p><p>Second, for instruction-tuned decoder-based models, we evaluated only a single prompt template and a single instruction or output format, following prior work [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref28">28</xref>], in which extraction spans are generated as inline boundary tags using the @@ and ## markers. We did not perform a systematic prompt engineering study (eg, ablations over instruction wording, formatting, or response schemas). As LLM extraction performance is sensitive to both prompt design and output representation, alternative prompting strategies and constrained output formats&#x2014;such as returning a structured list of extracted spans, structured JSON with character offsets, or other boundary representations&#x2014;may lead to different results. We also did not explore reasoning-augmented prompting (eg, rationale or chain-of-thought prompting), including variants where reasoning is elicited but the final answer is constrained to a structured schema, which could affect generative extraction behavior. Thus, the observed performance gap between encoder and decoder models may partly reflect the difficulty of generating precise character-level boundary tags under this specific template or format. Future work should systematically compare multiple prompting or output strategies to disentangle the effects of model architecture from output format constraints.</p><p>Third, to maintain consistent label-token alignment across models, we disabled byte fallback. Consequently, out-of-vocabulary characters were normalized to &#x003C;unk&#x003E;, potentially discarding surface cues for rare kanji and specialized medical symbols. 
This may have disadvantaged models with smaller vocabularies, including GPT-NeoX-3.6B, DeBERTaV2-base, RoBERTa-base, and the JMedRoBERTa/JMedDeBERTa variants. Notably, however, all models used for token classification&#x2014;both decoder only and encoder based&#x2014;were evaluated under this same unfavorable setting (ie, with byte fallback disabled), yet token classification still outperformed instruction tuning overall.</p><p>Fourth, owing to computational constraints, we applied parameter-efficient fine-tuning with LoRA to decoder models, whereas encoder models were fully fine-tuned. Although LoRA has been reported to achieve performance comparable to full fine-tuning [<xref ref-type="bibr" rid="ref41">41</xref>], its effectiveness can vary depending on the task and hyperparameter settings; therefore, caution is warranted when interpreting these comparative results.</p><p>Fifth, for decoder-only models in token classification, we retained the default causal attention mask; therefore, token-level predictions could not leverage right-side context. As bidirectional information is often helpful for sequence labeling, this design may disadvantage causal-masked decoders relative to encoders. Approaches that enable bidirectional information in decoder-only LLMs (eg, layer-wise causal mask removal) have been reported to improve sequence labeling performance [<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]. Therefore, our conclusion that encoder-based token classification outperformed decoder-based approaches should be interpreted as holding for decoder-only models used with the default causal mask under our fine-tuning constraints and may not generalize to decoder variants explicitly adapted to incorporate right-context information.</p><p>Sixth, comparisons between general-domain and medical-domain pretraining are confounded by substantial differences in corpus size (362 GB vs 0.56 GB) and data composition. 
The strong performance of general-domain models&#x2014;despite limited exposure to specialized medical terminology&#x2014;may reflect not only syntactic knowledge but also other advantages conferred by massive training data. Rigorous ablation studies that control for architecture, corpus size, and training conditions are therefore needed. Further research is also required to determine whether the observed encoder-decoder performance gap, the relative benefits of domain-specific versus general-domain pretraining, and the effects of prompting strategies generalize to other languages, clinical settings, and medical document types.</p><p>Seventh, the weighted soft matching score is an automated evaluation metric whose clinical validity has not yet been established, including its alignment with clinician judgments and its usefulness for downstream tasks. Moreover, as the score is computed based on the span boundaries of the reference annotations, it may penalize predictions even when boundary differences are clinically acceptable. Future work should quantify correlations with clinician ratings and downstream task performance to establish the metric&#x2019;s clinical validity.</p><p>Finally, our reference annotations represent an adjudicated consensus rather than an objective ground truth. As preadjudication annotations were not retained, we cannot report IAA to quantify the inherent subjectivity of the task. In a post hoc audit, 8 (4%) of 200 randomly sampled test cases contained at least one span requiring correction, mostly due to omissions or boundary deviations, indicating residual label noise and subjective boundary decisions. Moreover, as illustrated by examples where model predictions are clinically plausible despite differing from the reference, span-level <italic>F</italic><sub>1</sub>-score should be interpreted as agreement with this consensus reference, and may be bounded by a ceiling on reference-based performance rather than model capability. 
This limitation primarily affects the absolute magnitude of the scores; however, as all models were evaluated against the same reference standard under the same metric, relative comparisons across models remain informative. Future work will retain independent preadjudication labels to compute IAA and will complement reference-based metrics with clinician-centered acceptability assessments and/or more tolerant scoring schemes.</p></sec><sec id="s4-8"><title>Conclusions</title><p>This study is the first to investigate effective model selection for the E-CCR task&#x2014;an extraction task targeting terms and extended expressions (eg, diseases, conditions, and clinical findings considered critical for differential diagnosis) that are indispensable for the automatic construction of causal relationship knowledge from case reports&#x2014;by comparing encoder versus decoder models, as well as general-purpose versus domain-specific models. Additionally, we introduced a novel evaluation metric tailored to the E-CCR task, the weighted soft matching score. Furthermore, we analyzed model effectiveness in relation to the length of the extraction targets.</p><p>As a result, encoder-based token classification achieved the best performance in our evaluation. Under our evaluation setting&#x2014;where decoder models were constrained to output entity spans using inline boundary tags (eg, @@ and ##)&#x2014;token classification generally outperformed instruction tuning. Moreover, among the 17 models compared, we observed no case where a decoder model outperformed its corresponding encoder model. These findings suggest that, for the E-CCR task, token classification&#x2013;based extraction with encoder models can achieve high accuracy with relatively fewer parameters, making it advantageous in resource-constrained environments. This efficiency is also beneficial for real-world deployment in clinical practice, where inference latency and throughput are critical. 
That said, although encoder token classification outperformed decoder approaches under our standard fine-tuning recipe, decoder performance may improve with alternative prompting strategies, the use of reasoning-oriented models (ie, increased inference-time computation), or architectural or training modifications such as layer-wise relaxation of the causal mask for instruction tokens.</p><p>In contrast, while JMedDeBERTa(s), pretrained on medical text, obtained the highest average score on the E-CCR task, DeBERTaV2-base achieved the highest average score on the MRNER disease benchmark. These results indicate that there is no single universally optimal model; the relative advantage of domain-specific versus general-domain pretraining likely depends on the characteristics of the target task and the dataset.</p><p>Our results examining the effect of domain-specific texts relative to general texts align with previous findings [<xref ref-type="bibr" rid="ref33">33</xref>], indicating only marginal performance differences between models trained with and without domain-specific texts.</p><p>Furthermore, our findings indicate that models trained on relatively small amounts of domain-specific text can match or exceed the performance of models trained on large general corpora. Conversely, in situations where domain-specific text is scarce, models trained solely on general corpora may still deliver high performance. Therefore, model selection should be guided by the quantity and characteristics of the available training texts.</p><p>An analysis of the differences between instruction tuning and token classification with respect to the length of the extracted segments revealed that instruction tuning is well suited for extracting short terms but is less effective for extracting long expressions. In contrast, token classification proved effective for extracting extended phrases. 
Furthermore, evaluation using the weighted soft matching score showed that increasing the fragmentation penalty did not substantially degrade model performance. This suggests that marker positions split during extraction are infrequent, even when extracting long expressions.</p><p>A detailed analysis of the outputs from JMedDeBERTa(s) revealed inaccuracies in some manually assigned correct labels. This observation implies that actual model accuracy may surpass the reported figures.</p><p>Extracting clinical concepts necessitates targeting both conventional short expressions and extended phrases. To our knowledge, few prior studies have comprehensively addressed extended expression extraction in clinical texts. In this study, we performed model comparisons on a more comprehensive E-CCR task using the weighted soft matching score, obtaining insights into model selection, effectiveness with respect to expression length, and the impact of domain-specific texts. These findings constitute a substantial contribution to the advancement of clinical information extraction methodologies.</p></sec></sec></body><back><notes><sec><title>Funding</title><p>This work was partially supported by the Cross-ministerial Strategic Innovation Promotion Program on &#x201C;Integrated Health Care System&#x201D; (grant JPJ012425).</p></sec><sec><title>Data Availability</title><p>The datasets generated or analyzed in this study are not publicly available due to restrictions imposed by a data use agreement with the Japanese Society of Internal Medicine. However, subject to approval by the Japanese Society of Internal Medicine and the relevant affiliated institutions and/or regulatory authorities, the data may be made available from the corresponding author upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>YT contributed to reviewing and editing, writing the original draft, visualization, validation, software, methodology, and investigation. 
SK contributed to reviewing and editing and methodology. MI contributed to reviewing and editing and investigation. RN contributed to data curation, resources, funding acquisition, and supervision. TI contributed to conceptualization, data curation, resources, reviewing and editing, writing the original draft, methodology, investigation, funding acquisition, and supervision.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">BERT</term><def><p>Bidirectional Encoder Representations from Transformers</p></def></def-item><def-item><term id="abb2">E-CCR</term><def><p>extended clinical concept recognition</p></def></def-item><def-item><term id="abb3">IAA</term><def><p>interannotator agreement</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">LoRA</term><def><p>low-rank adaptation</p></def></def-item><def-item><term id="abb6">LSTM</term><def><p>long short-term memory</p></def></def-item><def-item><term id="abb7">MLM</term><def><p>masked language modeling</p></def></def-item><def-item><term id="abb8">MRNER disease</term><def><p>Medical Report Named Entity Recognition for positive disease</p></def></def-item><def-item><term id="abb9">NER</term><def><p>Named Entity Recognition</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="web"><article-title>About &#x201C;J-CaseMap: case search for difficult-to-diagnose cases&#x201D; [Article in Japanese]</article-title><source>The Japanese Society of Internal Medicine</source><year>2024</year><access-date>2024-12-01</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.naika.or.jp/j-casemap">https://www.naika.or.jp/j-casemap</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="other"><person-group 
person-group-type="author"><name name-style="western"><surname>Keraghel</surname><given-names>I</given-names> </name><name name-style="western"><surname>Morbieu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Nadif</surname><given-names>M</given-names> </name></person-group><article-title>Recent advances in named entity recognition: a comprehensive survey and comparative study</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 19, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2401.10825</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Madan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lentzen</surname><given-names>M</given-names> </name><name name-style="western"><surname>Brandt</surname><given-names>J</given-names> </name><name name-style="western"><surname>Rueckert</surname><given-names>D</given-names> </name><name name-style="western"><surname>Hofmann-Apitius</surname><given-names>M</given-names> </name><name name-style="western"><surname>Fr&#x00F6;hlich</surname><given-names>H</given-names> </name></person-group><article-title>Transformer models in biomedicine</article-title><source>BMC Med Inform Decis Mak</source><year>2024</year><month>07</month><day>29</day><volume>24</volume><issue>1</issue><fpage>214</fpage><pub-id pub-id-type="doi">10.1186/s12911-024-02600-5</pub-id><pub-id pub-id-type="medline">39075407</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cho</surname><given-names>HN</given-names> </name><name name-style="western"><surname>Jun</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>YH</given-names> 
</name><etal/></person-group><article-title>Task-specific transformer-based language models in health care: scoping review</article-title><source>JMIR Med Inform</source><year>2024</year><month>11</month><day>18</day><volume>12</volume><issue>12</issue><fpage>e49724</fpage><pub-id pub-id-type="doi">10.2196/49724</pub-id><pub-id pub-id-type="medline">39556827</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Luo</surname><given-names>X</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>B</given-names> </name><name name-style="western"><surname>Luo</surname><given-names>MY</given-names> </name></person-group><article-title>Pre-trained language models in medicine: a survey</article-title><source>Artif Intell Med</source><year>2024</year><month>08</month><volume>154</volume><issue>154</issue><fpage>102904</fpage><pub-id pub-id-type="doi">10.1016/j.artmed.2024.102904</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Hou</surname><given-names>W</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name></person-group><article-title>Deep learning for named entity recognition: a survey</article-title><source>Neural Comput &#x0026; Applic</source><year>2024</year><month>06</month><volume>36</volume><issue>16</issue><fpage>8995</fpage><lpage>9022</lpage><pub-id pub-id-type="doi">10.1007/s00521-024-09646-6</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>Yada</surname><given-names>S</given-names> </name><name name-style="western"><surname>Nakamura</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wakamiya</surname><given-names>S</given-names> </name><etal/></person-group><article-title>REAL-MedNL: overview of REAL document-based medical natural language processing task</article-title><year>2022</year><conf-name>Proceedings of the 16th NTCIR Conference on Evaluation of Information Access Technologies</conf-name><conf-date>Jun 14-17, 2022</conf-date><pub-id pub-id-type="doi">10.20736/0002002295</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kajiyama</surname><given-names>K</given-names> </name><name name-style="western"><surname>Horiguchi</surname><given-names>H</given-names> </name><name name-style="western"><surname>Okumura</surname><given-names>T</given-names> </name><name name-style="western"><surname>Morita</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kano</surname><given-names>Y</given-names> </name></person-group><article-title>De-identifying free text of Japanese electronic health records</article-title><source>J Biomed Semantics</source><year>2020</year><month>09</month><day>21</day><volume>11</volume><issue>1</issue><fpage>11</fpage><pub-id pub-id-type="doi">10.1186/s13326-020-00227-9</pub-id><pub-id pub-id-type="medline">32958039</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nath</surname><given-names>N</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>SH</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>I</given-names> </name></person-group><article-title>NEAR: named entity and attribute 
recognition of clinical concepts</article-title><source>J Biomed Inform</source><year>2022</year><month>06</month><volume>130</volume><issue>130</issue><fpage>104092</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2022.104092</pub-id><pub-id pub-id-type="medline">35533990</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yada</surname><given-names>S</given-names> </name><name name-style="western"><surname>Joh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Tanaka</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Towards a versatile medical-annotation guideline feasible without heavy medical knowledge: starting from critical lung diseases</article-title><year>2020</year><access-date>2026-04-28</access-date><conf-name>Proceedings of the 12th Language Resources and Evaluation Conference</conf-name><conf-date>May 11-16, 2020</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2020.lrec-1.561/">https://aclanthology.org/2020.lrec-1.561/</ext-link></comment></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ohno</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kato</surname><given-names>R</given-names> </name><name name-style="western"><surname>Ishikawa</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Using the natural language processing system medical named entity recognition-Japanese to analyze pharmaceutical care records: natural language processing analysis</article-title><source>JMIR Form Res</source><year>2024</year><month>06</month><day>4</day><volume>8</volume><issue>8</issue><fpage>e55798</fpage><pub-id pub-id-type="doi">10.2196/55798</pub-id><pub-id 
pub-id-type="medline">38833694</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kawazoe</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Shibata</surname><given-names>D</given-names> </name><name name-style="western"><surname>Shinohara</surname><given-names>E</given-names> </name><name name-style="western"><surname>Aramaki</surname><given-names>E</given-names> </name><name name-style="western"><surname>Ohe</surname><given-names>K</given-names> </name></person-group><article-title>A clinical specific BERT developed using a huge Japanese clinical text corpus</article-title><source>PLoS One</source><year>2021</year><volume>16</volume><issue>11</issue><fpage>e0259763</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0259763</pub-id><pub-id pub-id-type="medline">34752490</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nishiyama</surname><given-names>T</given-names> </name><name name-style="western"><surname>Yamaguchi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Han</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Automated system to capture patient symptoms from multitype Japanese clinical texts: retrospective study</article-title><source>JMIR Med Inform</source><year>2024</year><month>09</month><day>24</day><volume>12</volume><fpage>e58977</fpage><pub-id pub-id-type="doi">10.2196/58977</pub-id><pub-id pub-id-type="medline">39316418</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>C</given-names> </name></person-group><article-title>ROUGE: a package for automatic 
evaluation of summaries</article-title><year>2004</year><access-date>2026-04-28</access-date><conf-name>Proceedings of the Text Summarization Branches Out Workshop</conf-name><conf-date>Jul 25-26, 2004</conf-date><conf-loc>Barcelona, Spain</conf-loc><publisher-name>Association for Computational Linguistics</publisher-name><fpage>74</fpage><lpage>81</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/W04-1013/">https://aclanthology.org/W04-1013/</ext-link></comment></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>YP</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>YY</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>CH</given-names> </name><name name-style="western"><surname>Lai</surname><given-names>F</given-names> </name></person-group><article-title>Modified bidirectional encoder representations from transformers extractive summarization model for hospital information systems based on character-level tokens (AlphaBERT): development and performance evaluation</article-title><source>JMIR Med Inform</source><year>2020</year><month>04</month><day>29</day><volume>8</volume><issue>4</issue><fpage>e17787</fpage><pub-id pub-id-type="doi">10.2196/17787</pub-id><pub-id pub-id-type="medline">32347806</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Lafferty</surname><given-names>J</given-names> </name><name name-style="western"><surname>McCallum</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pereira</surname><given-names>F</given-names> </name></person-group><article-title>Conditional random 
fields: probabilistic models for segmenting and labeling sequence data</article-title><year>2001</year><access-date>2026-04-14</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://repository.upenn.edu/handle/20.500.14332/6188">https://repository.upenn.edu/handle/20.500.14332/6188</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Graves</surname><given-names>A</given-names> </name></person-group><article-title>Supervised sequence labelling with recurrent neural networks</article-title><source>Studies in Computational Intelligence</source><year>2012</year><access-date>2026-04-14</access-date><publisher-name>Springer</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://link.springer.com/10.1007/978-3-642-24797-2">https://link.springer.com/10.1007/978-3-642-24797-2</ext-link></comment><pub-id pub-id-type="doi">10.1007/978-3-642-24797-2</pub-id><pub-id pub-id-type="other">978-3-642-24797-2</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><year>2019</year><conf-name>Proceedings of the 2019 Conference of the North</conf-name><conf-date>Jun 2-7, 2019</conf-date><pub-id pub-id-type="doi">10.18653/v1/N19-1423</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="confproc"><person-group 
person-group-type="author"><name name-style="western"><surname>Vaswani</surname><given-names>A</given-names> </name><name name-style="western"><surname>Shazeer</surname><given-names>N</given-names> </name><name name-style="western"><surname>Parmar</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Attention is all you need</article-title><year>2017</year><access-date>2026-04-28</access-date><conf-name>Proceedings of the 31st Conference on Neural Information Processing Systems</conf-name><conf-date>Dec 4-9, 2017</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper/2017/hash/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html">https://proceedings.neurips.cc/paper/2017/hash/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>W</given-names> </name><name name-style="western"><surname>Peng</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Large language models for generative information extraction: a survey</article-title><source>Front Comput Sci</source><year>2024</year><month>12</month><volume>18</volume><issue>6</issue><fpage>186357</fpage><pub-id pub-id-type="doi">10.1007/s11704-024-40555-y</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Junfeng</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> 
</name><name name-style="western"><surname>Ding</surname><given-names>H</given-names> </name><name name-style="western"><surname>Song</surname><given-names>J</given-names> </name></person-group><article-title>A knowledge-enhanced medical named entity recognition method that integrates pre-trained language models</article-title><year>2023</year><conf-name>2023 IEEE International Conference on Medical Artificial Intelligence (MedAI)</conf-name><conf-date>Apr 14-19, 2024</conf-date><pub-id pub-id-type="doi">10.1109/MedAI59581.2023.00046</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Ashok</surname><given-names>D</given-names> </name><name name-style="western"><surname>Lipton</surname><given-names>ZC</given-names> </name></person-group><article-title>PromptNER: prompting for named entity recognition</article-title><source>arXiv</source><comment>Preprint posted online on  May 24, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2305.15444</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>X</given-names> </name><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><etal/></person-group><article-title>GPT-NER: named entity recognition via large language models</article-title><conf-name>Findings of the Association for Computational Linguistics</conf-name><conf-date>Apr 29 to May 4, 2025</conf-date><pub-id pub-id-type="doi">10.18653/v1/2025.findings-naacl.239</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>Y</given-names> 
</name><name name-style="western"><surname>Chen</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Du</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Improving large language models for clinical named entity recognition via prompt engineering</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>1812</fpage><lpage>1820</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocad259</pub-id><pub-id pub-id-type="medline">38281112</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>TB</given-names> </name><name name-style="western"><surname>Mann</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ryder</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Language models are few-shot learners</article-title><year>2020</year><access-date>2026-04-28</access-date><conf-name>Proceedings of the 34th International Conference on Neural Information Processing Systems</conf-name><conf-date>Dec 6-12, 2020</conf-date><conf-loc>Vancouver, BC, Canada (virtual)</conf-loc><fpage>1877</fpage><lpage>1901</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper/2020/hash/1457c0d6bfcb4967418bfb8ac142f64a-Abstract.html">https://proceedings.neurips.cc/paper/2020/hash/1457c0d6bfcb4967418bfb8ac142f64a-Abstract.html</ext-link></comment></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Achiam</surname><given-names>J</given-names> </name><name name-style="western"><surname>Adler</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Agarwal</surname><given-names>S</given-names> </name><etal/></person-group><article-title>GPT-4 technical report</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 15, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>X</given-names> </name><name name-style="western"><surname>Dong</surname><given-names>L</given-names> </name><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Pushing the limits of ChatGPT on NLP tasks</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 16, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2306.09719</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Keloth</surname><given-names>VK</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Xie</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>Advancing entity recognition in biomedicine via instruction tuning of large language models</article-title><source>Bioinformatics</source><year>2024</year><month>03</month><day>29</day><volume>40</volume><issue>4</issue><pub-id pub-id-type="doi">10.1093/bioinformatics/btae163</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><name 
name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Label supervised LLaMA finetuning</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 2, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2310.01208</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Sweeting</surname><given-names>GL</given-names> </name><etal/></person-group><article-title>Identify diabetic retinopathy-related clinical concepts and their attributes using transformer-based natural language processing methods</article-title><source>BMC Med Inform Decis Mak</source><year>2022</year><month>09</month><day>27</day><volume>22</volume><issue>Suppl 3</issue><fpage>255</fpage><pub-id pub-id-type="doi">10.1186/s12911-022-01996-2</pub-id><pub-id pub-id-type="medline">36167551</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yoon</surname><given-names>W</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>S</given-names> </name><etal/></person-group><article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title><source>Bioinformatics</source><year>2020</year><month>02</month><day>15</day><volume>36</volume><issue>4</issue><fpage>1234</fpage><lpage>1240</lpage><pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Lentzen</surname><given-names>M</given-names> </name><name name-style="western"><surname>Madan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lage-Rupprecht</surname><given-names>V</given-names> </name><etal/></person-group><article-title>Critical assessment of transformer-based AI models for German clinical notes</article-title><source>JAMIA Open</source><year>2022</year><month>12</month><volume>5</volume><issue>4</issue><fpage>ooac087</fpage><pub-id pub-id-type="doi">10.1093/jamiaopen/ooac087</pub-id><pub-id pub-id-type="medline">36380848</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Garc&#x00ED;a Subies</surname><given-names>G</given-names> </name><name name-style="western"><surname>Barbero Jim&#x00E9;nez</surname><given-names>&#x00C1;</given-names> </name><name name-style="western"><surname>Mart&#x00ED;nez Fern&#x00E1;ndez</surname><given-names>P</given-names> </name></person-group><article-title>A comparative analysis of Spanish Clinical encoder-based models on NER and classification tasks</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>2137</fpage><lpage>2146</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae054</pub-id><pub-id pub-id-type="medline">38489543</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>He</surname><given-names>P</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>J</given-names> </name><etal/></person-group><article-title>DeBERTa: decoding-enhanced BERT with disentangled 
attention</article-title><year>2021</year><access-date>2026-04-28</access-date><conf-name>Proceedings of the International Conference on Learning Representations</conf-name><conf-date>May 3-7, 2021</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://openreview.net/forum?id=XPZIaotutsD">https://openreview.net/forum?id=XPZIaotutsD</ext-link></comment></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ott</surname><given-names>M</given-names> </name><name name-style="western"><surname>Goyal</surname><given-names>N</given-names> </name><etal/></person-group><article-title>RoBERTa: a robustly optimized BERT pretraining approach</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 26, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1907.11692</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Radford</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Child</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Language models are unsupervised multitask learners</article-title><year>2019</year><access-date>2026-04-28</access-date><publisher-name>OpenAI Blog</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf">https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf</ext-link></comment></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="confproc"><person-group 
person-group-type="author"><name name-style="western"><surname>Black</surname><given-names>S</given-names> </name><name name-style="western"><surname>Biderman</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hallahan</surname><given-names>E</given-names> </name><etal/></person-group><article-title>GPT-NeoX-20B: an open-source autoregressive language model</article-title><year>2022</year><conf-name>Proceedings of BigScience Episode #5 -- Workshop on Challenges &#x0026; Perspectives in Creating Large Language Models</conf-name><conf-date>May 22-27, 2022</conf-date><pub-id pub-id-type="doi">10.18653/v1/2022.bigscience-1.9</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Touvron</surname><given-names>H</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Stone</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Llama 2: open foundation and fine-tuned chat models</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 18, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2307.09288</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Jiang</surname><given-names>AQ</given-names> </name><name name-style="western"><surname>Sablayrolles</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mensch</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Mistral 7B</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 10, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2310.06825</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation 
citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kudo</surname><given-names>T</given-names> </name><name name-style="western"><surname>Richardson</surname><given-names>J</given-names> </name></person-group><article-title>SentencePiece: a simple and language independent subword tokenizer and detokenizer for neural text processing</article-title><year>2018</year><conf-name>Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Oct 31 to Nov 4, 2018</conf-date><conf-loc>Brussels, Belgium</conf-loc><fpage>66</fpage><lpage>71</lpage><pub-id pub-id-type="doi">10.18653/v1/D18-2012</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wallis</surname><given-names>P</given-names> </name><etal/></person-group><article-title>LoRA: low-rank adaptation of large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 17, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2106.09685</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Duki&#x0107;</surname><given-names>D</given-names> </name><name name-style="western"><surname>Snajder</surname><given-names>J</given-names> </name></person-group><article-title>Looking right is sometimes right: investigating the capabilities of decoder-only LLMs for sequence labeling</article-title><conf-name>Findings of the Association for Computational Linguistics ACL 2024</conf-name><conf-date>Aug 11-16, 2024</conf-date><pub-id 
pub-id-type="doi">10.18653/v1/2024.findings-acl.843</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kuki&#x0107;</surname><given-names>ML</given-names> </name><name name-style="western"><surname>&#x010C;uljak</surname><given-names>M</given-names> </name><name name-style="western"><surname>Duki&#x0107;</surname><given-names>D</given-names> </name><name name-style="western"><surname>Tutek</surname><given-names>M</given-names> </name><name name-style="western"><surname>&#x0160;najder</surname><given-names>J</given-names> </name></person-group><article-title>Sequence repetition enhances token embeddings and improves sequence labeling with decoder-only language models</article-title><year>2026</year><conf-name>Findings of the Association for Computational Linguistics</conf-name><conf-date>Mar 24-29, 2026</conf-date><pub-id pub-id-type="doi">10.18653/v1/2026.findings-eacl.339</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Jiang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Aizawa</surname><given-names>A</given-names> </name></person-group><article-title>JMedBench: a benchmark for evaluating Japanese biomedical large language models</article-title><year>2025</year><access-date>2026-04-28</access-date><conf-name>Proceedings of the 31st International Conference on Computational Linguistics</conf-name><conf-date>Jan 19-24, 2025</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2025.coling-main.395/">https://aclanthology.org/2025.coling-main.395/</ext-link></comment></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation 
citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kaplan</surname><given-names>J</given-names> </name><name name-style="western"><surname>McCandlish</surname><given-names>S</given-names> </name><name name-style="western"><surname>Henighan</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Scaling laws for neural language models</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 23, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2001.08361</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nazi</surname><given-names>ZA</given-names> </name><name name-style="western"><surname>Peng</surname><given-names>W</given-names> </name></person-group><article-title>Large language models in healthcare and medical domain: a review</article-title><source>Informatics (MDPI)</source><year>2024</year><volume>11</volume><issue>3</issue><fpage>57</fpage><pub-id pub-id-type="doi">10.3390/informatics11030057</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jeong</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>E</given-names> </name></person-group><article-title>SciDeBERTa: learning DeBERTa for science technology documents and fine-tuning information extraction tasks</article-title><source>IEEE Access</source><year>2022</year><volume>10</volume><issue>10</issue><fpage>60805</fpage><lpage>60813</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2022.3180830</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Luo</surname><given-names>Z</given-names> 
</name><name name-style="western"><surname>Xi</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>J</given-names> </name><etal/></person-group><article-title>DecBERT: enhancing the language understanding of BERT with causal attention masks</article-title><year>2022</year><conf-name>Findings of the Association for Computational Linguistics</conf-name><conf-date>Jul 10-15, 2022</conf-date><conf-loc>Seattle, United States</conf-loc><publisher-name>Association for Computational Linguistics</publisher-name><fpage>1185</fpage><lpage>1197</lpage><pub-id pub-id-type="doi">10.18653/v1/2022.findings-naacl.89</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>JM</given-names> </name><etal/></person-group><article-title>A pre-trained BERT for Korean medical natural language processing</article-title><source>Sci Rep</source><year>2022</year><volume>12</volume><issue>1</issue><pub-id pub-id-type="doi">10.1038/s41598-022-17806-8</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Extended experimental details and evaluation results that support the main findings. 
Contents include (1) overall and case-level performance of the 17 encoder or decoder models on the J-CaseMap and MRNER disease datasets under 4 scoring metrics (weighted soft matching, marker matching, character segment, and token segment scores), (2) pretraining and fine-tuning hyperparameters for JMedDeBERTa(s) or JMedDeBERTa(c) and all comparison models, (3) training and validation loss curves, (4) input-output agreement analysis for the instruction-tuned decoder, (5) gold entity length distribution and length-stratified performance, and (6) representative examples illustrating discrepancies between model predictions and reference annotations.</p><media xlink:href="jmir_v28i1e78681_app1.docx" xlink:title="DOCX File, 10144 KB"/></supplementary-material></app-group></back></article>