<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.0" xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v26i1e54985</article-id>
      <article-id pub-id-type="pmid">39255016</article-id>
      <article-id pub-id-type="doi">10.2196/54985</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>The Diagnostic Ability of GPT-3.5 and GPT-4.0 in Surgery: Comparative Analysis</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Cahill</surname>
            <given-names>Naomi</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zhu</surname>
            <given-names>Lingxuan</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Jinhui</surname>
            <given-names>Liu</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>Jiayu</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6976-4568</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Liang</surname>
            <given-names>Xiuting</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0008-5029-1500</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Fang</surname>
            <given-names>Dandong</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0009-7712-9826</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Zheng</surname>
            <given-names>Jiqi</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0001-9070-9894</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Yin</surname>
            <given-names>Chengliang</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8262-5749</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Xie</surname>
            <given-names>Hui</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff7" ref-type="aff">7</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6219-1696</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Yanteng</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0613-8898</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Sun</surname>
            <given-names>Xiaochun</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0001-5291-5742</ext-link>
        </contrib>
        <contrib id="contrib9" contrib-type="author">
          <name name-style="western">
            <surname>Tong</surname>
            <given-names>Yue</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0009-8163-8367</ext-link>
        </contrib>
        <contrib id="contrib10" contrib-type="author">
          <name name-style="western">
            <surname>Che</surname>
            <given-names>Hebin</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0007-3060-0014</ext-link>
        </contrib>
        <contrib id="contrib11" contrib-type="author">
          <name name-style="western">
            <surname>Hu</surname>
            <given-names>Ping</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0004-4820-5360</ext-link>
        </contrib>
        <contrib id="contrib12" contrib-type="author">
          <name name-style="western">
            <surname>Yang</surname>
            <given-names>Fan</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0003-5790-0907</ext-link>
        </contrib>
        <contrib id="contrib13" contrib-type="author">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Bingxian</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0000-5071-9441</ext-link>
        </contrib>
        <contrib id="contrib14" contrib-type="author">
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>Yuanyuan</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0003-0330-8270</ext-link>
        </contrib>
        <contrib id="contrib15" contrib-type="author">
          <name name-style="western">
            <surname>Cheng</surname>
            <given-names>Gang</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0003-5341-5808</ext-link>
        </contrib>
        <contrib id="contrib16" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>Jianning</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Neurosurgery</institution>
            <institution>The First Medical Centre</institution>
            <institution>Chinese PLA General Hospital</institution>
            <addr-line>No. 28 Fuxing Road</addr-line>
            <addr-line>Haidian District</addr-line>
            <addr-line>Beijing, 100853</addr-line>
            <country>China</country>
            <phone>86 01066938439</phone>
            <email>jnzhang2018@163.com</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1943-7045</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Neurosurgery</institution>
        <institution>The First Medical Centre</institution>
        <institution>Chinese PLA General Hospital</institution>
        <addr-line>Beijing</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Respiratory and Critical Care Medicine</institution>
        <institution>The First Medical Centre</institution>
        <institution>Chinese PLA General Hospital</institution>
        <addr-line>Beijing</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Neurosurgery</institution>
        <institution>Sanmenxia Central Hospital</institution>
        <addr-line>Sanmenxia</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>School of Health Humanities</institution>
        <institution>Peking University</institution>
        <addr-line>Beijing</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Medical Innovation Research Division</institution>
        <institution>Chinese People's Liberation Army General Hospital</institution>
        <addr-line>Beijing</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff6">
        <label>6</label>
        <institution>National Engineering Research Center for Medical Big Data Application Technology</institution>
        <institution>Chinese People's Liberation Army General Hospital</institution>
        <addr-line>Beijing</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff7">
        <label>7</label>
        <institution>Departments of Urology</institution>
        <institution>The First Affiliated Hospital of Fujian Medical University</institution>
        <addr-line>Fuzhou</addr-line>
        <country>China</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Jianning Zhang <email>jnzhang2018@163.com</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>10</day>
        <month>9</month>
        <year>2024</year>
      </pub-date>
      <volume>26</volume>
      <elocation-id>e54985</elocation-id>
      <history>
        <date date-type="received">
          <day>29</day>
          <month>11</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>30</day>
          <month>1</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>5</day>
          <month>2</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>24</day>
          <month>7</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Jiayu Liu, Xiuting Liang, Dandong Fang, Jiqi Zheng, Chengliang Yin, Hui Xie, Yanteng Li, Xiaochun Sun, Yue Tong, Hebin Che, Ping Hu, Fan Yang, Bingxian Wang, Yuanyuan Chen, Gang Cheng, Jianning Zhang. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 10.09.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2024/1/e54985" xlink:type="simple"/>
      <related-article related-article-type="commentary" id="v26i1e66453" ext-link-type="doi" xlink:href="10.2196/66453" vol="26" page="e66453" xlink:type="simple">http://jmir.org/2024/1/e66453/</related-article>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>ChatGPT (OpenAI) has shown great potential in clinical diagnosis and could become an excellent auxiliary tool in clinical practice. This study investigates and evaluates ChatGPT in diagnostic capabilities by comparing the performance of GPT-3.5 and GPT-4.0 across model iterations.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to evaluate the precise diagnostic ability of GPT-3.5 and GPT-4.0 for colon cancer and its potential as an auxiliary diagnostic tool for surgeons and compare the diagnostic accuracy rates between GPT-3.5 and GPT-4.0. We precisely assess the accuracy of primary and secondary diagnoses and analyze the causes of misdiagnoses in GPT-3.5 and GPT-4.0 according to 7 categories: patient histories, symptoms, physical signs, laboratory examinations, imaging examinations, pathological examinations, and intraoperative findings.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We retrieved 316 case reports for intestinal cancer from the Chinese Medical Association Publishing House database, of which 286 cases were deemed valid after data cleansing. The cases were translated from Mandarin to English and then input into GPT-3.5 and GPT-4.0 using a simple, direct prompt to elicit primary and secondary diagnoses. We conducted a comparative study to evaluate the diagnostic accuracy of GPT-4.0 and GPT-3.5. Three senior surgeons from the General Surgery Department, specializing in Colorectal Surgery, assessed the diagnostic information at the Chinese PLA (People’s Liberation Army) General Hospital. The accuracy of primary and secondary diagnoses was scored based on predefined criteria. Additionally, we analyzed and compared the causes of misdiagnoses in both models according to 7 categories: patient histories, symptoms, physical signs, laboratory examinations, imaging examinations, pathological examinations, and intraoperative findings.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Out of 286 cases, GPT-4.0 and GPT-3.5 both demonstrated high diagnostic accuracy for primary diagnoses, but the accuracy rates of GPT-4.0 were significantly higher than GPT-3.5 (mean 0.972, SD 0.137 vs mean 0.855, SD 0.335; <italic>t</italic><sub>285</sub>=5.753; <italic>P</italic>&lt;.001). For secondary diagnoses, the accuracy rates of GPT-4.0 were also significantly higher than GPT-3.5 (mean 0.908, SD 0.159 vs mean 0.617, SD 0.349; <italic>t</italic><sub>285</sub>=–7.727; <italic>P</italic>&lt;.001). GPT-3.5 showed limitations in processing patient history, symptom presentation, laboratory tests, and imaging data. While GPT-4.0 improved upon GPT-3.5, it still has limitations in identifying symptoms and laboratory test data. For both primary and secondary diagnoses, there was no significant difference in accuracy related to age, gender, or system group between GPT-4.0 and GPT-3.5.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>This study demonstrates that ChatGPT, particularly GPT-4.0, possesses significant diagnostic potential, with GPT-4.0 exhibiting higher accuracy than GPT-3.5. However, GPT-4.0 still has limitations, particularly in recognizing patient symptoms and laboratory data, indicating a need for more research in real-world clinical settings to enhance its diagnostic capabilities.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>ChatGPT</kwd>
        <kwd>accuracy rates</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>diagnosis</kwd>
        <kwd>surgeon</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Artificial intelligence (AI) can potentially improve health care outcomes as a tool in various fields of medicine [<xref ref-type="bibr" rid="ref1">1</xref>]. While its role in medicine has thus far been limited to specific cases, it has not been widely implemented in clinical practice. Recently, OpenAI released an AI-powered ChatGPT for public use [<xref ref-type="bibr" rid="ref2">2</xref>]. ChatGPT is a powerful neural network model belonging to the GPT family of large language models (LLMs). Despite being created primarily for humanlike conversations, ChatGPT has shown remarkable versatility and has the potential to revolutionize many industries [<xref ref-type="bibr" rid="ref3">3</xref>]. Its release has garnered widespread attention for potential use in numerous fields, especially in medicine [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref6">6</xref>].</p>
      <p>Recently, GPT-4.0, an upgraded version of GPT-3.5, demonstrated remarkable improvements in professional and academic benchmarks, including the US Medical Licensing Examination [<xref ref-type="bibr" rid="ref7">7</xref>]. We previously used ChatGPT to answer medical questions and compared ChatGPT’s abilities with those of evidence-based neurosurgeons [<xref ref-type="bibr" rid="ref8">8</xref>]. Unlike low-seniority surgeons, ChatGPT has no lack of foundational knowledge and is not limited by cognitive load; the performance of GPT-4.0, in particular, was comparable to that of surgeons with high seniority [<xref ref-type="bibr" rid="ref8">8</xref>]. It also showed that ChatGPT generates semantically accurate responses to text-based questions, including medical textual data, making it a highly promising candidate for clinical decision support (CDS) applications.</p>
      <p>Colorectal cancer is a type of cancer that affects the colon (large intestine) or rectum [<xref ref-type="bibr" rid="ref9">9</xref>], which is the third most common cancer and the second leading cause of cancer-related deaths worldwide, accounting for approximately 10% of all cancer cases [<xref ref-type="bibr" rid="ref9">9</xref>]. Colon cancer is a kind of disease with very obvious symptoms including diarrhea, constipation, blood in the stool, abdominal pain, unexplained weight loss, fatigue, and low iron levels [<xref ref-type="bibr" rid="ref10">10</xref>]. A diagnosis can be made once a surgeon recognizes that a patient has these symptoms.</p>
      <p>Surgeons usually use clinical experience to make diagnoses based on patients’ symptoms, examination results, medical histories, physical signs, and intraoperative findings. This process is time-consuming, occupies the workforce, and is subjective. Misdiagnoses or delays in diagnosis may occur. Innovative approaches to improve diagnostic accuracy and efficiency are needed. Many AI models have been used in health care to aid CDS [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. However, few studies exist on how ChatGPT can read and analyze clinical case text data and provide diagnoses based on real-world clinical information obtained at the hospital.</p>
      <p>This study investigates the use of ChatGPT for diagnosis and assesses its performance. Furthermore, substantial performance improvement indicates that LLMs are quickly approaching readiness for use in the clinical setting. Continuous evaluation is necessary to keep pace with model progress. Therefore, we also compared the performance of two versions of ChatGPT (GPT-4.0 and GPT-3.5) to understand the impact of model updates on performance.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Study Design</title>
        <p>We designed a comparative study to evaluate the applicability of ChatGPT (GPT-4.0 and GPT-3.5) in surgery using clinical information from the case database of the Chinese Medical Association Publishing House (CMAPH). GPT-3.5 is a free, widely accessible platform, while GPT-4.0 is a newer release claiming better performance, allowing us to evaluate improvements in GPT-4.0 comprehensively. The CMAPH database is a comprehensive and authoritative resource. It provides access to current and archived issues across various medical specialties, making it invaluable for researchers, clinicians, and health care professionals [<xref ref-type="bibr" rid="ref14">14</xref>]. This study used colorectal cancer–related literature from this database, leveraging its extensive repository to support our analysis.</p>
      </sec>
      <sec>
        <title>Study Data</title>
        <p>A total of 316 case reports for intestinal cancer between August 1982 and June 2023 were retrieved from the CMAPH database by a researcher (CY); then, FY extracted the case records from the literature and designed a standardized data organization form by Excel (Microsoft Corp), which included extracted case records, three response records from GPT-3.5 and GPT-4.0, an evaluation section for the responses from GPT-3.5 and GPT-4.0, and a detailed assessment of seven categories of misdiagnoses. Another researcher (BW) then conducted a thorough data cleansing process, removing cases with incomplete information, excluding diagnosis-related data from the literature, and translating the data according to clinical terminology. JL was responsible for verifying the translation results. We applied the World Health Organization’s <italic>ICD-11</italic> (<italic>International Classification of Diseases, 11th Revision</italic>) diagnostic criteria [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>] to these cases, excluding cases where the diagnosis and related information conflicted with the <italic>ICD-11</italic> standards. This process resulted in a final count of 286 valid cases for analysis. The collected data included age, gender, medical histories, symptoms, physical signs, laboratory examinations, imaging examinations, pathological examinations, and intraoperative findings, all structured as text data. Patient identities were kept confidential and anonymized for the analysis.</p>
      </sec>
      <sec>
        <title>Prompt Design and Input Strategies</title>
        <p>We aimed for GPT-3.5 and GPT-4.0 to provide appropriate primary and secondary diagnoses based on the case information from the literature data. Therefore, we designed the prompt in English, “Please provide the most likely primary and secondary diagnoses,” and conducted tests accordingly. We randomly selected 5 of the 286 cases for initial testing, using the designed prompt to query GPT-3.5 and GPT-4.0 thrice each. It was a success if both models provided diagnoses without refusing to answer. JZ conducted these tests, and the answers obtained were recorded in the standardized data form. Subsequently, these 5 cases were not queried again. During these tests, both GPT-3.5 and GPT-4.0 successfully provided answers as required. Consequently, we finalized this prompt for use in this study. A researcher (JZ) used the same computer device to query GPT-3.5 and GPT-4.0 from August 1, 2023, to November 1, 2023, while another researcher (YT) was responsible for proofreading and verification.</p>
        <p>Due to the potential for inconsistent responses from ChatGPT when the same question is asked multiple times, we posed the prompt for each case three times to GPT-3.5 and GPT-4.0. In this study, the prompt for each case was first posed three times using GPT-3.5, followed by three times using GPT-4.0, without providing any feedback to either model after each inquiry. To avoid any contextual learning that could affect the accuracy of the responses, each inquiry to GPT-3.5 and GPT-4.0 was conducted in a new chat session. The mean accuracy rate represents the average accuracy derived from these three inquiries for each model. We conducted the Fleiss κ test on the primary and secondary diagnostic responses provided by GPT-3.5 and GPT-4.0, defining consistency as the ability to provide the same information with each submission.</p>
      </sec>
      <sec>
        <title>Data Measurement</title>
        <p>The accuracy of GPT-3.5 and GPT-4.0 in providing primary and secondary diagnoses was evaluated based on explicit diagnoses in published literature. Diagnoses that matched the explicit diagnoses in the literature were recorded as “correct,” while those that did not were recorded as “incorrect.” An accuracy rate exceeding 0.800 for GPT-4.0 was deemed acceptable, while an accuracy rate exceeding 0.600 for GPT-3.5 was considered acceptable [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref19">19</xref>].</p>
        <p>We assessed the diagnostic errors made by GPT-3.5 and GPT-4.0, categorizing them into 7 types: medical histories, symptoms, physical signs, laboratory examinations, imaging examinations, pathological examinations, and intraoperative findings. Due to the literature case records containing these 7 aspects, we discussed and decided to categorize the diagnostic errors accordingly. The criteria for each category are as follows: (1) medical histories: past medical conditions, treatments, surgeries, and other relevant health information; (2) symptoms: subjective indications reported by the patient, such as pain, fever, changes in bowel habits, rectal bleeding, abdominal pain, unexpected weight loss, and fatigue; (3) physical signs: objective observations during a physical examination including abdominal tenderness, palpable masses, anemia, or ascites; (4) laboratory examination: analysis of biological samples including blood chemistry, hematology, urinalysis, complete blood count for anemia, stool tests for occult blood, and tumor markers like carcinoembryonic antigen; (5) imaging examination: techniques to create internal body images such as x-rays, computed tomography scans, magnetic resonance imaging, ultrasound, and barium enemas; (6) pathological examination: microscopic analysis of tissue samples to determine cancer type and differentiation; and (7) intraoperative findings: observations during surgery regarding tumor location, size, extent, and involvement of adjacent organs or lymph nodes. For instance, if a diagnosis is primarily based on the patient’s medical history and ChatGPT’s response is incorrect, the error pattern is classified as a history-based error. If the diagnosis derives from multiple sources and ChatGPT provides an incorrect answer, we categorize all relevant sources as contributing to the error pattern.</p>
        <p>We selected 3 senior surgeons not involved in this study to independently evaluate the diagnostic information and determine the primary and secondary diagnoses. All three senior surgeons are from the General Surgery Department, specializing in Colorectal Surgery, at the Chinese PLA (People’s Liberation Army) General Hospital, each with at least 20 years of clinical experience. The National Health Commission of the People’s Republic of China defines senior surgeons as chief physicians with over 3 years of experience as associate chief physicians in surgery. The primary and secondary diagnoses provided by GPT-3.5 and GPT-4.0 were scored based on the previously mentioned accuracy assessment criteria. For the primary diagnosis, responses were scored as “correct” with a value of 1 and “incorrect” with a value of 0. For secondary diagnoses, the scoring was based on the ratio of the number of correct secondary diagnoses provided by ChatGPT to the total number of correct secondary diagnoses in the literature. For example, if there were five correct secondary diagnoses and ChatGPT provided three correct ones, the score would be 3/5. The diagnostic errors of GPT-3.5 and GPT-4.0 were categorized according to the error pattern classification criteria mentioned above. Errors involving a category were assigned a value of 1, while noninvolvement was assigned a value of 0. The three senior surgeons used the standardized data recording form to document their evaluations, and discrepancies were discussed in meetings held at the hospital to reach a consensus on the scoring methodology to ensure consistency and accuracy in the evaluation process, with interrater agreement assessed using the Cohen κ statistic.</p>
      </sec>
      <sec>
        <title>Statistical Analysis</title>
        <p>The data were normally distributed as determined by the Kolmogorov-Smirnov test. We analyzed data using SPSS (version 25.0; IBM Corp). We used mean (SD) values to represent numerical variables and described qualitative variables using absolute values of special group cases. The statistical methods were mainly based on <italic>t</italic> tests, and repeated measures analysis of variance was used for statistical analysis. Differences were considered statistically significant at <italic>P</italic>&lt;.05.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>No ethical approval or informed consent was required for this study, as it used publicly available data. All study designs strictly adhere to the Guidelines and Checklist for the Reporting on Digital Health Implementations checklist [<xref ref-type="bibr" rid="ref20">20</xref>]. This study does not use AI to generate any related content.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Baseline Characteristics</title>
        <p>Detailed questions and responses are included in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> (Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Cohen κ ranged from moderate to near perfect agreement (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). For primary diagnoses, the mean accuracy rates of GPT-4.0 were 0.972 (SD 0.137), and GPT-3.5 were 0.855 (SD 0.335). The accuracy rates of GPT-4.0 were significantly higher than GPT-3.5 (<italic>t</italic><sub>285</sub>=5.753; <italic>P</italic>&lt;.001). For secondary diagnoses, the mean accuracy rate of GPT-4.0 was 0.908 (SD 0.159), and GPT-3.5 was 0.617 (SD 0.349). The accuracy rates of GPT-4.0 were also significantly higher than GPT-3.5 (<italic>t</italic><sub>285</sub>=–7.727; <italic>P</italic>&lt;.001; <xref ref-type="table" rid="table1">Table 1</xref> and <xref rid="figure1" ref-type="fig">Figure 1</xref>A).</p>
        <p>We resolved any discrepancies. Of the 286 cases, 19 cases exhibited inconsistencies, while 267 cases were consistent, resulting in an agreement rate of 93.36%. The discrepancy rate stands at 6.64%. The three senior surgeons demonstrated excellent agreement in their assessments (<xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). We analyzed the ChatGPT performance according to error patterns. The rates of diagnosis errors due to medical histories in GPT-4.0 were significantly lower than in GPT-3.5 (mean 0.13, SD 0.520 vs mean 0.76, SD 1.446; <italic>t</italic><sub>285</sub>=–3.384; <italic>P</italic>&lt;.001). The rates of diagnosis errors due to symptoms in GPT-4.0 were significantly lower than in GPT-3.5 (mean 0.33, SD 0.877 vs mean 0.55, SD 1.125; <italic>t</italic><sub>285</sub>=–3.840; <italic>P</italic>=.001). The rates of diagnosis errors due to physical signs in GPT-4.0 were significantly lower than in GPT-3.5 (mean 0.12, SD 0.516 vs mean 0.28, SD 0.867; <italic>t</italic><sub>285</sub>=–5.959; <italic>P</italic>&lt;.001). The rates of diagnosis errors due to laboratory examination in GPT-4.0 were significantly lower than in GPT-3.5 (mean 0.17, SD 0.645 vs mean 0.55, SD 1.131; <italic>t</italic><sub>285</sub>=–6.738; <italic>P</italic>&lt;.001). The rates of diagnosis errors due to imaging examination in GPT-4.0 were significantly lower than in GPT-3.5 (mean 0.11, SD 0.483 vs mean 0.54, SD 1.125; <italic>t</italic><sub>285</sub>=–6.846; <italic>P</italic>&lt;.001). The rates of diagnosis errors due to pathological examination in GPT-4.0 were significantly lower than in GPT-3.5 (mean 0.04, SD 0.294 vs mean 0.20, SD 0.739; <italic>t</italic><sub>285</sub>=–3.536; <italic>P</italic>&lt;.001). 
The rates of diagnosis errors due to intraoperative findings in GPT-4.0 were significantly lower than in GPT-3.5 (mean 0.08, SD 0.403 vs mean 0.59, SD 1.258; <italic>t</italic><sub>285</sub>=14.006; <italic>P</italic>&lt;.001; <xref ref-type="table" rid="table1">Table 1</xref> and <xref rid="figure1" ref-type="fig">Figure 1</xref>B).</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Baseline characteristics.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="290"/>
            <col width="0"/>
            <col width="180"/>
            <col width="0"/>
            <col width="180"/>
            <col width="0"/>
            <col width="200"/>
            <col width="0"/>
            <col width="120"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Analyzed pairs</td>
                <td colspan="2">GPT-4.0, mean (SD)</td>
                <td colspan="2">GPT-3.5, mean (SD)</td>
                <td colspan="2"><italic>t</italic> test (<italic>df</italic>)</td>
                <td><italic>P</italic> value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="3">Primary diagnoses</td>
                <td colspan="2">0.972 (0.137)</td>
                <td colspan="2">0.855 (0.335)</td>
                <td colspan="2">5.753 (285)</td>
                <td>&lt;.001</td>
              </tr>
              <tr valign="top">
                <td colspan="3">Secondary diagnoses</td>
                <td colspan="2">0.908 (0.159)</td>
                <td colspan="2">0.617 (0.349)</td>
                <td colspan="2">–7.727 (285)</td>
                <td>&lt;.001</td>
              </tr>
              <tr valign="top">
                <td colspan="10">
                  <bold>Error patterns</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Medical histories</td>
                <td colspan="2">0.13 (0.520)</td>
                <td colspan="2">0.76 (1.446)</td>
                <td colspan="2">–3.384 (285)</td>
                <td colspan="2">&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Symptoms</td>
                <td colspan="2">0.33 (0.877)</td>
                <td colspan="2">0.55 (1.125)</td>
                <td colspan="2">–3.840 (285)</td>
                <td colspan="2">.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Physical signs</td>
                <td colspan="2">0.12 (0.516)</td>
                <td colspan="2">0.28 (0.867)</td>
                <td colspan="2">–5.959 (285)</td>
                <td colspan="2">&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Laboratory examination</td>
                <td colspan="2">0.17 (0.645)</td>
                <td colspan="2">0.55 (1.131)</td>
                <td colspan="2">–6.738 (285)</td>
                <td colspan="2">&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Imaging examination</td>
                <td colspan="2">0.11 (0.483)</td>
                <td colspan="2">0.54 (1.125)</td>
                <td colspan="2">–6.846 (285)</td>
                <td colspan="2">&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Pathological examination</td>
                <td colspan="2">0.04 (0.294)</td>
                <td colspan="2">0.20 (0.739)</td>
                <td colspan="2">–3.536 (285)</td>
                <td colspan="2">&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Intraoperative findings</td>
                <td colspan="2">0.08 (0.403)</td>
                <td colspan="2">0.59 (1.258)</td>
                <td colspan="2">14.006 (285)</td>
                <td colspan="2">&lt;.001</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>The comparison of GPT-4.0 and GPT-3.5. (A) The comparison of GPT-4.0 and GPT-3.5 in diagnostic accuracy for the primary and secondary diagnoses. (B) The comparison of GPT-4.0 and GPT-3.5 in error patterns (medical histories, symptoms, physical signs, examination results, and intraoperative findings). (C) The comparison of GPT-4.0 and GPT-3.5 in age, gender, and system group for the primary diagnoses. (D) The comparison of GPT-4.0 and GPT-3.5 in age, gender, and system group for the secondary diagnoses.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e54985_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Comparison of Accuracy Rates According to Specialty Classification in Primary Diagnoses</title>
        <p>Regarding age groups, there was no significant difference between ages of &lt;60 years and ≥60 years in GPT-4.0 (mean 0.972, SD 0.137 vs mean 0.972, SD 0.137; <italic>t</italic><sub>284</sub>=0.002; <italic>P</italic>=.99), and also no significant difference between them in GPT-3.5 (mean 0.875, SD 0.318 vs mean 0.832, SD 0.354; <italic>t</italic><sub>284</sub>=1.086; <italic>P</italic>=.28). In the gender group, there was no significant difference between male and female in GPT-4.0 (mean 0.970, SD 0.147 vs mean 0.975, SD 0.119; <italic>t</italic><sub>284</sub>=–0.291; <italic>P</italic>=.77), and also no significant difference between them in GPT-3.5 (mean 0.842, SD 0.345 vs mean 0.879, SD 0.319; <italic>t</italic><sub>284</sub>=–0.898; <italic>P</italic>=.37). In the diagnostic type group, there was no significant difference between “solely intradigestive system” and “both intradigestive and extradigestive system” in GPT-4.0 (mean 0.961, SD 0.166 vs mean 0.979, SD 0.114; <italic>t</italic><sub>284</sub>=–1.063; <italic>P</italic>=.29), and also no significant difference between them in GPT-3.5 (mean 0.863, SD 0.333 vs mean 0.851, SD 0.337; <italic>t</italic><sub>284</sub>=0.308; <italic>P</italic>=.76; <xref ref-type="table" rid="table2">Table 2</xref>). Moreover, there was a significant difference between GPT-4.0 and GPT-3.5 in age, gender, and diagnostic type group (<xref rid="figure1" ref-type="fig">Figure 1</xref>C).</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Comparison of accuracy rates according to specialty classification in primary diagnoses.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="290"/>
            <col width="180"/>
            <col width="180"/>
            <col width="200"/>
            <col width="120"/>
            <col width="0"/>
            <col width="0"/>
            <col width="0"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Analyzed pairs</td>
                <td>GPT-4.0, mean (SD)</td>
                <td>GPT-3.5, mean (SD)</td>
                <td><italic>t</italic> test (<italic>df</italic>)</td>
                <td><italic>P</italic> value</td>
                <td>
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="8">
                  <bold>Age (in years)<sup>a</sup></bold>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>&lt;60</td>
                <td>0.972 (0.137)</td>
                <td>0.875 (0.318)</td>
                <td>3.688 (154)</td>
                <td>&lt;.001</td>
                <td>
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>≥60</td>
                <td>0.972 (0.137)</td>
                <td>0.832 (0.354)</td>
                <td>4.445 (130)</td>
                <td>&lt;.001</td>
                <td>
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td colspan="9">
                  <bold>Sex<sup>b</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Male</td>
                <td>0.970 (0.147)</td>
                <td>0.842 (0.345)</td>
                <td>3.025 (106)</td>
                <td>.003</td>
                <td>
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Female</td>
                <td>0.975 (0.119)</td>
                <td>0.879 (0.319)</td>
                <td>4.910 (178)</td>
                <td>&lt;.001</td>
                <td>
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td colspan="8">
                  <bold>Diagnostic type<sup>c</sup></bold>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Solely intradigestive system</td>
                <td>0.961 (0.166)</td>
                <td>0.863 (0.333)</td>
                <td>2.983 (111)</td>
                <td>.004</td>
                <td>
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Both intradigestive and extradigestive system</td>
                <td>0.979 (0.114)</td>
                <td>0.851 (0.337)</td>
                <td>4.991 (173)</td>
                <td>&lt;.001</td>
                <td>
                  <break/>
                </td>
                <td colspan="2">
                  <break/>
                </td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>In the age group, GPT-4.0 <italic>t</italic><sub>284</sub>=0.002 and <italic>P</italic>=.99; GPT-3.5 <italic>t</italic><sub>284</sub>=1.086 and <italic>P</italic>=.28.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>In the gender group, GPT-4.0 <italic>t</italic><sub>284</sub>=–0.291 and <italic>P</italic>=.77; GPT-3.5 <italic>t</italic><sub>284</sub>=–0.898 and <italic>P</italic>=.37.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>In the diagnostic type group, GPT-4.0 <italic>t</italic><sub>284</sub>=–1.063 and <italic>P</italic>=.29; GPT-3.5 <italic>t</italic><sub>284</sub>=0.308 and <italic>P</italic>=.76.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Comparison of Accuracy Rates According to Specialty Classification in Secondary Diagnoses</title>
        <p>Regarding age group, there was no significant difference between ages of &lt;60 years and ≥60 years in GPT-4.0 (mean 0.900, SD 0.153 vs mean 0.918, SD 0.165; <italic>t</italic><sub>263</sub>=–0.910; <italic>P</italic>=.36), and also no significant difference between them in GPT-3.5 (mean 0.596, SD 0.349 vs mean 0.639, SD 0.349; <italic>t</italic><sub>264</sub>=–1.006; <italic>P</italic>=.32). In the gender group, there was no significant difference between male and female in GPT-4.0 (mean 0.916, SD 0.152 vs mean 0.896, SD 0.170; <italic>t</italic><sub>263</sub>=0.972; <italic>P</italic>=.33), and also no significant difference between them in GPT-3.5 (mean 0.597, SD 0.352 vs mean 0.646, SD 0.343; <italic>t</italic><sub>264</sub>=–1.114; <italic>P</italic>=.27). In the diagnostic type group, there was no significant difference between “solely intradigestive system” and “both intradigestive and extradigestive system” in GPT-4.0 (mean 0.889, SD 0.185 vs mean 0.921, SD 0.138; <italic>t</italic><sub>263</sub>=–1.569; <italic>P</italic>=.12), and also no significant difference between them in GPT-3.5 (mean 0.607, SD 0.325 vs mean 0.621, SD 0.365; <italic>t</italic><sub>264</sub>=–0.333; <italic>P</italic>=.74; <xref ref-type="table" rid="table3">Table 3</xref>). Moreover, there was a significant difference between GPT-4.0 and GPT-3.5 in age, gender, and diagnostic type groups (<xref ref-type="table" rid="table3">Table 3</xref> and <xref rid="figure1" ref-type="fig">Figure 1</xref>D).</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Comparison of accuracy rates according to specialty classification in secondary diagnoses.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="290"/>
            <col width="180"/>
            <col width="180"/>
            <col width="200"/>
            <col width="120"/>
            <col width="0"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Analyzed pairs</td>
                <td>GPT-4.0, mean (SD)</td>
                <td>GPT-3.5, mean (SD)</td>
                <td><italic>t</italic> test (<italic>df</italic>)</td>
                <td><italic>P</italic> value</td>
                <td>
                  <break/>
                </td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="7">
                  <bold>Age (in years)<sup>a</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>&lt;60</td>
                <td>0.900 (0.153)</td>
                <td>0.596 (0.349)</td>
                <td>10.764 (142)</td>
                <td>&lt;.001</td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>≥60</td>
                <td>0.918 (0.165)</td>
                <td>0.639 (0.349)</td>
                <td>8.974 (121)</td>
                <td>&lt;.001</td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td colspan="7">
                  <bold>Sex<sup>b</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Male</td>
                <td>0.916 (0.152)</td>
                <td>0.597 (0.352)</td>
                <td>7.902 (99)</td>
                <td>&lt;.001</td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Female</td>
                <td>0.896 (0.170)</td>
                <td>0.646 (0.343)</td>
                <td>11.613 (164)</td>
                <td>&lt;.001</td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td colspan="7">
                  <bold>Diagnostic type<sup>c</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Solely intradigestive system</td>
                <td>0.889 (0.185)</td>
                <td>0.607 (0.325)</td>
                <td>8.850 (104)</td>
                <td>&lt;.001</td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Both intradigestive and extradigestive system</td>
                <td>0.921 (0.138)</td>
                <td>0.621 (0.365)</td>
                <td>10.844 (159)</td>
                <td>&lt;.001</td>
                <td>
                  <break/>
                </td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>In the age group, GPT-4.0 <italic>t</italic><sub>263</sub>=–0.910 and <italic>P</italic>=.36; GPT-3.5 <italic>t</italic><sub>264</sub>=–1.006 and <italic>P</italic>=.32.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>In the gender group, GPT-4.0 <italic>t</italic><sub>263</sub>=0.972 and <italic>P</italic>=.33; GPT-3.5 <italic>t</italic><sub>264</sub>=–1.114 and <italic>P</italic>=.27.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>In the diagnostic type group, GPT-4.0 <italic>t</italic><sub>263</sub>=–1.569 and <italic>P</italic>=.12; GPT-3.5 <italic>t</italic><sub>264</sub>=–0.333 and <italic>P</italic>=.74.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>This comparative study evaluated the diagnostic capabilities of ChatGPT, specifically comparing the performance of GPT-4.0 and GPT-3.5 using clinical information from a public case database. Our analysis focused on specialty classification and error patterns including medical histories, symptoms, physical signs, examination results, and intraoperative findings. The results indicated that GPT-4.0 achieved significantly higher accuracy rates for primary and secondary diagnoses than GPT-3.5. Despite improvements, both versions exhibited limitations in processing patient history, symptom presentation, and laboratory test data, with GPT-4.0 showing better performance overall. These findings are consistent with the performance evaluations of GPT-3.5 and GPT-4.0 by Taloni et al [<xref ref-type="bibr" rid="ref21">21</xref>], Yang et al [<xref ref-type="bibr" rid="ref22">22</xref>], Deng et al [<xref ref-type="bibr" rid="ref23">23</xref>], and Antaki et al [<xref ref-type="bibr" rid="ref24">24</xref>]. However, challenges persist in accurately identifying symptoms and interpreting laboratory results. These findings underscore ChatGPT’s potential in medical diagnosis, highlighting the need for further research to enhance its performance in dynamic clinical environments. This study contributes to ongoing efforts to optimize ChatGPT as an auxiliary tool in clinical practice, emphasizing the importance of continual refinement to ensure its efficacy and reliability in supporting health care professionals’ diagnostic decision-making.</p>
        <p>ChatGPT’s ability to read and analyze is helpful in multiple clinical disciplines. Howard et al [<xref ref-type="bibr" rid="ref25">25</xref>] asked ChatGPT for antimicrobial advice and pointed out that despite no specific clinical advice training, ChatGPT provides compelling responses to most prompts. Nastasi et al [<xref ref-type="bibr" rid="ref26">26</xref>] found that ChatGPT is currently useful for providing background knowledge on general clinical topics in an emergency. Training ChatGPT with medical corpora, incorporating clinician-supervised feedback, and enhancing its awareness of uncertainty and information-seeking behaviors may improve the medical advice provided by LLMs. Lukac et al [<xref ref-type="bibr" rid="ref27">27</xref>] evaluated ChatGPT as an adjunct for decision-making in primary breast cancer cases and stated that the eloquence of ChatGPT based on several scientific databases could result in more precise suggestions. Eggmann and Blatz [<xref ref-type="bibr" rid="ref28">28</xref>] analyzed the chances and challenges of using ChatGPT in dentistry and found that the use of ChatGPT and similar LLMs in dentistry could streamline administrative workflows and potentially serve as an additional tool for CDS in the future. Kuroiwa et al [<xref ref-type="bibr" rid="ref29">29</xref>] discussed that the integration of AI, natural language processing, and GPT technologies holds immense promise in the field of psychiatry, which has the potential to revolutionize the way psychiatric disorders are diagnosed, treated, and monitored. Due to the evident symptoms of colon cancer, such as diarrhea, constipation, and blood in the stool, ChatGPT can provide basic, accurate diagnoses based on textual medical records. Previous studies have demonstrated ChatGPT’s strong ability to read and analyze text data. Therefore, in diagnosing colon cancer, ChatGPT can assist in improving doctors’ work efficiency and reducing the workload of initial diagnoses.</p>
        <p>Previous studies have evaluated the accuracy of ChatGPT from the perspective of patient self-diagnosis [<xref ref-type="bibr" rid="ref29">29</xref>], but this study evaluated its accuracy based on clearly defined case record text data from publicly available literature and found its potential to assist in diagnosis, thereby reducing the time required to achieve accurate diagnosis and improving clinical efficiency. The application of ChatGPT in medical diagnosis offers several noteworthy advantages. We set a priori accuracy rate exceeding 0.800 for GPT-4.0 while a priori accuracy rate exceeding 0.600 for GPT-3.5, according to the previous studies [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref19">19</xref>]. In this study, we found that GPT-4.0 has a diagnostic accuracy above 0.900, while GPT-3.5 exhibits an accuracy of over 0.800 in the diagnosis of colon cancer. These results also confirmed that the diagnostic accuracy rates of GPT-3.5 and GPT-4.0 reached acceptable a priori accuracy rates, aligning with previous research findings.</p>
        <p>For the primary and secondary diagnoses, the accuracy rates of GPT-4.0 were significantly higher than GPT-3.5. These results suggest significant enhancements in the algorithms and data processing capabilities of GPT-4.0. GPT-4.0 has more advanced neural network architectures, better natural language understanding, and a more extensive and diverse training data set than GPT-3.5, which enables it to handle complex medical diagnostic scenarios effectively [<xref ref-type="bibr" rid="ref30">30</xref>-<xref ref-type="bibr" rid="ref33">33</xref>]. GPT-4.0 better contextualizes information and the enhanced ability to understand and process complex medical terminology queries is crucial in clinical diagnosis [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]. GPT-4.0’s enhanced ability to contextualize information and process complex medical terminology queries has been instrumental in clinical diagnosis, as it provides a deeper understanding of medical context, terminology, symptom presentation, and laboratory data interpretation [<xref ref-type="bibr" rid="ref29">29</xref>]. Based on these considerations, the GPT-3.5 to GPT-4.0 upgrade appears effective.</p>
        <p>In comparing the classification accuracy of the primary diagnosis, we found that GPT-4.0 can achieve high accuracy in disease diagnosis across different ages, genders, and diagnostic type groups. The ability of GPT-4.0 to accurately diagnose patients of all ages and genders is crucial. We believe GPT-4.0 can catch the nuance between different age and gender groups, which can exhibit varying symptoms for the same disease. Besides, previous studies have also presented that GPT-4.0 shows its potential and the richness of its use scenarios in diagnosing a range of diseases across different systems. Hasani et al [<xref ref-type="bibr" rid="ref35">35</xref>] evaluated the performance of GPT-4.0 in standardizing radiology reports and indicated that GPT-4.0 could be a reliable tool for generating standardized radiology reports, offering potential benefits such as improved efficiency, better communication, and simplified data extraction and analysis. Liu et al [<xref ref-type="bibr" rid="ref8">8</xref>] compared the abilities of GPT-4.0 and neurosurgeons, which showed that GPT-4.0’s ability was comparable to that of neurosurgeons with high seniority, and Kanjee et al [<xref ref-type="bibr" rid="ref36">36</xref>] assessed the diagnostic accuracy of the GPT-4.0 in a series of challenging cases and found that generative AI is a promising adjunct to human cognition in diagnosis. This broad range of diagnostic capabilities could be valuable in clinical practice to assist physicians.</p>
        <p>In this study, we categorized the diagnostic errors of GPT-3.5 and GPT-4.0 based on seven aspects covered in the literature: medical histories, symptoms, physical signs, laboratory examinations, imaging examinations, pathological examinations, and intraoperative findings. These 7 aspects are crucial for making definitive diagnoses in clinical practice. Our multidimensional evaluation, therefore, reflects a realistic simulation of clinical application. This approach provides significant clinical value by highlighting the model’s performance across all essential diagnostic dimensions. Such comprehensive assessment ensures that the evaluation of ChatGPT’s diagnostic capabilities is both thorough and relevant to real-world clinical settings. GPT-3.5 has limitations in assessing patient history, symptom presentation, laboratory tests, and imaging data. GPT-3.5 is trained on large data sets, which may not extensively cover specialized medical information, exceptionally detailed patient histories, intricate symptomatology, and complex laboratory or imaging data. GPT-3.5 cannot interpret symptoms, lab results, and imaging data with a nuanced understanding of the patient’s overall clinical picture including history, physical examination, and evolving clinical scenarios as physicians [<xref ref-type="bibr" rid="ref37">37</xref>]. Furthermore, the knowledge of GPT-3.5 is static. It does not learn in real-time or adapt based on new patient information or the latest medical research, unlike physicians who continually update their knowledge and practice [<xref ref-type="bibr" rid="ref38">38</xref>]. The above limitations have all been improved in GPT-4.0.</p>
        <p>However, although GPT-4.0 has been upgraded and improved compared with GPT-3.5, things could still be improved. GPT-4.0 has limitations in identifying symptoms and laboratory test data by classifying and analyzing causes of misdiagnosis. First, like GPT-3.5, GPT-4.0 has limitations in training data scope and clinical context, and experience affects the model’s ability to recognize and interpret specific medical data accurately. It may need more specific medical data, impacting its ability to accurately identify and analyze complex medical symptoms and laboratory results. Second, GPT-4.0 lacks clinical context and cannot accurately understand the clinical context in which medical data is presented [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. For instance, interpreting laboratory results requires understanding the numbers and the clinical context including patient history and physical examination findings. Third, GPT-4.0 is a general model, not adjusted explicitly for medical applications [<xref ref-type="bibr" rid="ref40">40</xref>]. While it can process medical information, it cannot replace a human medical professional’s nuanced judgment and experiential learning [<xref ref-type="bibr" rid="ref41">41</xref>].</p>
        <p>In summary, although the evaluation results of GPT-4.0 in various classification diagnostic accuracies are better than GPT-3.5, it still has certain limitations, and its accuracy in medical diagnosis should be viewed with caution. It is a supportive tool [<xref ref-type="bibr" rid="ref42">42</xref>], not a replacement for professional medical advice and judgment. It has yet to fully reach the diagnostic ability of clinical doctors. In the future, it is still necessary to continue training and improving GPT-4.0’s ability to recognize patient symptoms and laboratory test data.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>This study has several limitations. First, we did not evaluate the models in real clinical settings. The dynamics of real-time clinical environments, complex and often unpredictable patient presentations, intraoperative findings, real imaging data, and pathological sections differ markedly from the controlled, historical data sets we used. Consequently, our findings do not fully encompass the challenges and variances in actual clinical practice. Second, we compared the diagnostic capabilities of GPT-3.5 and GPT-4.0 with predefined diagnoses in the literature rather than assessing their performance against the actual diagnostic abilities of human surgeons. Third, the study only included GPT-3.5 and GPT-4.0 models. With the rapid advancement of AI, numerous other models, such as Google’s Bard and Microsoft’s Copilot, have emerged. Future research should consider incorporating a broader range of AI models to evaluate their diagnostic capabilities comprehensively. Additionally, given that colorectal cancer is a prevalent disease in the field of surgery, this study merely serves as an initial exploration of ChatGPT’s application in surgery, using colorectal cancer as an example. We also did not assess GPT-3.5 and GPT-4.0’s ability to judge disease severity. In the future, we intend to include a broader range of surgical diseases to explore the potential of GPT-assisted clinical applications in surgery fully. The different prompts can yield varying answers. In this study, we crafted a simple and direct prompt and conducted multiple tests with both models, as subtle variations can influence the accuracy and relevance. Standardizing prompts is crucial to ensure the comparability and reliability of AI diagnostic performance assessments.</p>
        <p>Moreover, the diagnostic basis relied solely on textual patient records from existing literature, which only demonstrates GPT-3.5 and GPT-4.0’s ability to extract and analyze textual information. This study did not include or evaluate imaging and pathology interpretation, which are critical components of comprehensive clinical diagnosis. Another limitation is that the patient records were originally in Chinese and were translated into English before querying GPT-3.5 and GPT-4.0. Both machine and human translations may only partially capture the nuances and details of the original records, potentially affecting the accuracy and completeness of the information presented to the models. Finally, the study’s design as a cross-sectional comparative study provides a lower level of evidence in the hierarchy of evidence-based medicine compared to randomized controlled trials. Future research should involve randomized controlled trials for more robust and reliable assessments of GPT-assisted clinical applications.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This study shows that ChatGPT has potential in medical diagnosis and may serve as a tool to assist doctors in clinical diagnosis and improve work efficiency. Generally, the diagnostic accuracy of GPT-4.0 is better than that of GPT-3.5, indicating that the upgrade of the ChatGPT version is effective. However, GPT-4.0 still has limitations regarding patient symptoms and laboratory data recognition, which need to be further studied in the dynamic clinical practice environment in the future.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>The case prompt provided to ChatGPT.</p>
        <media xlink:href="jmir_v26i1e54985_app1.docx" xlink:title="DOCX File , 403 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Cohen Kappa statistic for GPT-3.5 or GPT-4.0 agreement.</p>
        <media xlink:href="jmir_v26i1e54985_app2.docx" xlink:title="DOCX File , 16 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Cohen Kappa statistic for senior surgeons’ agreement.</p>
        <media xlink:href="jmir_v26i1e54985_app3.docx" xlink:title="DOCX File , 16 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">CDS</term>
          <def>
            <p>clinical decision support</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">CMAPH</term>
          <def>
            <p>Chinese Medical Association Publishing House</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">ICD-11</term>
          <def>
            <p>International Classification of Diseases, 11th Revision</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">PLA</term>
          <def>
            <p>People’s Liberation Army</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The data sets generated and analyzed during this study are available from the corresponding author upon reasonable request.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>JL and JZ contributed to the literature search, study design, data collection, data interpretation, and writing. All use of ChatGPT in this study was done by JZ and HX. GC, YT, YL, XS, HC, PH, DF, FY, BW, XL, and YC provided feedback on manuscript texts. J Zhang and CY contributed to the study design and provided feedback on all manuscript texts.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Esteva</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Robicquet</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ramsundar</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Kuleshov</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>DePristo</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chou</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Thrun</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A guide to deep learning in healthcare</article-title>
          <source>Nat Med</source>
          <year>2019</year>
          <volume>25</volume>
          <issue>1</issue>
          <fpage>24</fpage>
          <lpage>29</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-018-0316-z</pub-id>
          <pub-id pub-id-type="medline">30617335</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-018-0316-z</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="web">
          <article-title>Introducing ChatGPT</article-title>
          <source>OpenAI</source>
          <year>2022</year>
          <access-date>2023-08-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openai.com/blog/chatgpt">https://openai.com/blog/chatgpt</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gordijn</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Have</surname>
              <given-names>HT</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT: evolution or revolution?</article-title>
          <source>Med Health Care Philos</source>
          <year>2023</year>
          <volume>26</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>2</lpage>
          <pub-id pub-id-type="doi">10.1007/s11019-023-10136-0</pub-id>
          <pub-id pub-id-type="medline">36656495</pub-id>
          <pub-id pub-id-type="pii">10.1007/s11019-023-10136-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abdel-Messih</surname>
              <given-names>MS</given-names>
            </name>
            <name name-style="western">
              <surname>Boulos</surname>
              <given-names>MNK</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT in clinical toxicology</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <volume>9</volume>
          <fpage>e46876</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e46876/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/46876</pub-id>
          <pub-id pub-id-type="medline">36867743</pub-id>
          <pub-id pub-id-type="pii">v9i1e46876</pub-id>
          <pub-id pub-id-type="pmcid">PMC10034604</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dave</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Athaluri</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT in medicine: an overview of its applications, advantages, limitations, future prospects, and ethical considerations</article-title>
          <source>Front Artif Intell</source>
          <year>2023</year>
          <volume>6</volume>
          <fpage>1169595</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37215063"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/frai.2023.1169595</pub-id>
          <pub-id pub-id-type="medline">37215063</pub-id>
          <pub-id pub-id-type="pmcid">PMC10192861</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rao</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Isath</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Krishnan</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Tangsrivimol</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Virk</surname>
              <given-names>HUH</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Glicksberg</surname>
              <given-names>BS</given-names>
            </name>
            <name name-style="western">
              <surname>Krittanawong</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT: a conceptual review of applications and utility in the field of medicine</article-title>
          <source>J Med Syst</source>
          <year>2024</year>
          <volume>48</volume>
          <issue>1</issue>
          <fpage>59</fpage>
          <pub-id pub-id-type="doi">10.1007/s10916-024-02075-x</pub-id>
          <pub-id pub-id-type="medline">38836893</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10916-024-02075-x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kung</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Cheatham</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Medenilla</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sillos</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>De Leon</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Elepaño</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Madriaga</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aggabao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Diaz-Candido</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Maningo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tseng</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title>
          <source>PLOS Digit Health</source>
          <year>2023</year>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>e0000198</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36812645"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id>
          <pub-id pub-id-type="medline">36812645</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-22-00371</pub-id>
          <pub-id pub-id-type="pmcid">PMC9931230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Yin</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>A descriptive study based on the comparison of ChatGPT and evidence-based neurosurgeons</article-title>
          <source>iScience</source>
          <year>2023</year>
          <volume>26</volume>
          <issue>9</issue>
          <fpage>107590</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2589-0042(23)01667-X"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.isci.2023.107590</pub-id>
          <pub-id pub-id-type="medline">37705958</pub-id>
          <pub-id pub-id-type="pii">S2589-0042(23)01667-X</pub-id>
          <pub-id pub-id-type="pmcid">PMC10495632</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sung</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ferlay</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Siegel</surname>
              <given-names>RL</given-names>
            </name>
            <name name-style="western">
              <surname>Laversanne</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Soerjomataram</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Jemal</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bray</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Global cancer statistics 2020: GLOBOCAN estimates of incidence and mortality worldwide for 36 cancers in 185 countries</article-title>
          <source>CA Cancer J Clin</source>
          <year>2021</year>
          <volume>71</volume>
          <issue>3</issue>
          <fpage>209</fpage>
          <lpage>249</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://onlinelibrary.wiley.com/doi/10.3322/caac.21660"/>
          </comment>
          <pub-id pub-id-type="doi">10.3322/caac.21660</pub-id>
          <pub-id pub-id-type="medline">33538338</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Holtedahl</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Borgquist</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Donker</surname>
              <given-names>GA</given-names>
            </name>
            <name name-style="western">
              <surname>Buntinx</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Weller</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Campbell</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Månsson</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hammersley</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Braaten</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Parajuli</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Symptoms and signs of colorectal cancer, with differences between proximal and distal colon cancer: a prospective cohort study of diagnostic accuracy in primary care</article-title>
          <source>BMC Fam Pract</source>
          <year>2021</year>
          <volume>22</volume>
          <issue>1</issue>
          <fpage>148</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcfampract.biomedcentral.com/articles/10.1186/s12875-021-01452-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12875-021-01452-6</pub-id>
          <pub-id pub-id-type="medline">34238248</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12875-021-01452-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC8268573</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>De Panfilis</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Peruselli</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Tanzi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Botrugno</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>AI-based clinical decision-making systems in palliative medicine: ethical challenges</article-title>
          <source>BMJ Support Palliat Care</source>
          <year>2023</year>
          <volume>13</volume>
          <issue>2</issue>
          <fpage>183</fpage>
          <lpage>189</lpage>
          <pub-id pub-id-type="doi">10.1136/bmjspcare-2021-002948</pub-id>
          <pub-id pub-id-type="medline">34257065</pub-id>
          <pub-id pub-id-type="pii">bmjspcare-2021-002948</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rao</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Pang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kamineni</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lie</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Prasad</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Landman</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dreyer</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Succi</surname>
              <given-names>MD</given-names>
            </name>
          </person-group>
          <article-title>Assessing the utility of ChatGPT throughout the entire clinical workflow: development and usability study</article-title>
          <source>J Med Internet Res</source>
          <year>2023</year>
          <volume>25</volume>
          <fpage>e48659</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2023/1/e48659/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48659</pub-id>
          <pub-id pub-id-type="medline">37606976</pub-id>
          <pub-id pub-id-type="pii">v25i1e48659</pub-id>
          <pub-id pub-id-type="pmcid">PMC10481210</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sharma</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Savage</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Nair</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Larsson</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Svedberg</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Nygren</surname>
              <given-names>JM</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence applications in health care practice: scoping review</article-title>
          <source>J Med Internet Res</source>
          <year>2022</year>
          <volume>24</volume>
          <issue>10</issue>
          <fpage>e40238</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2022/10/e40238/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/40238</pub-id>
          <pub-id pub-id-type="medline">36197712</pub-id>
          <pub-id pub-id-type="pii">v24i10e40238</pub-id>
          <pub-id pub-id-type="pmcid">PMC9582911</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="web">
          <source>Chinese Medical Association Publishing House (CMAPH)</source>
          <access-date>2023-07-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.yiigle.com/index">https://www.yiigle.com/index</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Harrison</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Weber</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jakob</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Chute</surname>
              <given-names>CG</given-names>
            </name>
          </person-group>
          <article-title>ICD-11: an international classification of diseases for the twenty-first century</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2021</year>
          <volume>21</volume>
          <issue>Suppl 6</issue>
          <fpage>206</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-021-01534-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-021-01534-6</pub-id>
          <pub-id pub-id-type="medline">34753471</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-021-01534-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC8577172</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>The Lancet</collab>
          </person-group>
          <article-title>ICD-11</article-title>
          <source>Lancet</source>
          <year>2019</year>
          <volume>393</volume>
          <issue>10188</issue>
          <fpage>2275</fpage>
          <pub-id pub-id-type="doi">10.1016/S0140-6736(19)31205-X</pub-id>
          <pub-id pub-id-type="medline">31180012</pub-id>
          <pub-id pub-id-type="pii">S0140-6736(19)31205-X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vaira</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Lechien</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Abbate</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Allevi</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Audino</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Beltramini</surname>
              <given-names>GA</given-names>
            </name>
            <name name-style="western">
              <surname>Bergonzani</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bolzoni</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Committeri</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Crimi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gabriele</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Lonardi</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Maglitto</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Petrocelli</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pucci</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Saponaro</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Tel</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Vellone</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Chiesa-Estomba</surname>
              <given-names>CM</given-names>
            </name>
            <name name-style="western">
              <surname>Boscolo-Rizzo</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Salzano</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>De Riu</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Accuracy of ChatGPT-generated information on head and neck and oromaxillofacial surgery: a multicenter collaborative analysis</article-title>
          <source>Otolaryngol Head Neck Surg</source>
          <year>2024</year>
          <volume>170</volume>
          <issue>6</issue>
          <fpage>1492</fpage>
          <lpage>1503</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://air.unimi.it/handle/2434/1024627"/>
          </comment>
          <pub-id pub-id-type="doi">10.1002/ohn.489</pub-id>
          <pub-id pub-id-type="medline">37595113</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Krusche</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Callhoff</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Knitza</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ruffer</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Diagnostic accuracy of a large language model in rheumatology: comparison of physician and ChatGPT-4</article-title>
          <source>Rheumatol Int</source>
          <year>2024</year>
          <volume>44</volume>
          <issue>2</issue>
          <fpage>303</fpage>
          <lpage>306</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37742280"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s00296-023-05464-6</pub-id>
          <pub-id pub-id-type="medline">37742280</pub-id>
          <pub-id pub-id-type="pii">10.1007/s00296-023-05464-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC10796566</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ali</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>OY</given-names>
            </name>
            <name name-style="western">
              <surname>Connolly</surname>
              <given-names>ID</given-names>
            </name>
            <name name-style="western">
              <surname>Fridley</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Shin</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Zadnik Sullivan</surname>
              <given-names>PL</given-names>
            </name>
            <name name-style="western">
              <surname>Cielo</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Oyelese</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>Doberstein</surname>
              <given-names>CE</given-names>
            </name>
            <name name-style="western">
              <surname>Telfeian</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Gokaslan</surname>
              <given-names>ZL</given-names>
            </name>
            <name name-style="western">
              <surname>Asaad</surname>
              <given-names>WF</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT, GPT-4, and Google Bard on a neurosurgery oral boards preparation question bank</article-title>
          <source>Neurosurgery</source>
          <year>2023</year>
          <volume>93</volume>
          <issue>5</issue>
          <fpage>1090</fpage>
          <lpage>1098</lpage>
          <pub-id pub-id-type="doi">10.1227/neu.0000000000002551</pub-id>
          <pub-id pub-id-type="medline">37306460</pub-id>
          <pub-id pub-id-type="pii">00006123-990000000-00775</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Franck</surname>
              <given-names>CP</given-names>
            </name>
            <name name-style="western">
              <surname>Babington-Ashaye</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dietrich</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bediang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Veltsos</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>PP</given-names>
            </name>
            <name name-style="western">
              <surname>Juech</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Kadam</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Collin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Setian</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Pons</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Kwankam</surname>
              <given-names>SY</given-names>
            </name>
            <name name-style="western">
              <surname>Garrette</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Barbe</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Bagayoko</surname>
              <given-names>CO</given-names>
            </name>
            <name name-style="western">
              <surname>Mehl</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Lovis</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Geissbuhler</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>iCHECK-DH: guidelines and checklist for the reporting on digital health implementations</article-title>
          <source>J Med Internet Res</source>
          <year>2023</year>
          <volume>25</volume>
          <fpage>e46694</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2023/1/e46694/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/46694</pub-id>
          <pub-id pub-id-type="medline">37163336</pub-id>
          <pub-id pub-id-type="pii">v25i1e46694</pub-id>
          <pub-id pub-id-type="pmcid">PMC10209789</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Taloni</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Borselli</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Scarsi</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Rossi</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Coco</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Scorcia</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Giannaccare</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Comparative performance of humans versus GPT-4.0 and GPT-3.5 in the self-assessment program of American Academy of Ophthalmology</article-title>
          <source>Sci Rep</source>
          <year>2023</year>
          <volume>13</volume>
          <issue>1</issue>
          <fpage>18562</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-023-45837-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-023-45837-2</pub-id>
          <pub-id pub-id-type="medline">37899405</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-023-45837-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC10613606</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>WH</given-names>
            </name>
            <name name-style="western">
              <surname>Chan</surname>
              <given-names>YH</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>CP</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>TJ</given-names>
            </name>
          </person-group>
          <article-title>Comparative analysis of GPT-3.5 and GPT-4.0 in Taiwan's medical technologist certification: a study in artificial intelligence advancements</article-title>
          <source>J Chin Med Assoc</source>
          <year>2024</year>
          <volume>87</volume>
          <issue>5</issue>
          <fpage>525</fpage>
          <lpage>530</lpage>
          <pub-id pub-id-type="doi">10.1097/JCMA.0000000000001092</pub-id>
          <pub-id pub-id-type="medline">38551357</pub-id>
          <pub-id pub-id-type="pii">02118582-990000000-00370</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Deng</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>T</given-names>
            </name>
            <collab>Yangzhang</collab>
            <name name-style="western">
              <surname>Zhai</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Tao</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Evaluation of large language models in breast cancer clinical scenarios: a comparative analysis based on ChatGPT-3.5, ChatGPT-4.0, and Claude2</article-title>
          <source>Int J Surg</source>
          <year>2024</year>
          <volume>110</volume>
          <issue>4</issue>
          <fpage>1941</fpage>
          <lpage>1950</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/38668655"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/JS9.0000000000001066</pub-id>
          <pub-id pub-id-type="medline">38668655</pub-id>
          <pub-id pub-id-type="pii">01279778-202404000-00008</pub-id>
          <pub-id pub-id-type="pmcid">PMC11019981</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Antaki</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Milad</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Chia</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Giguère</surname>
              <given-names>CÉ</given-names>
            </name>
            <name name-style="western">
              <surname>Touma</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>El-Khoury</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Keane</surname>
              <given-names>PA</given-names>
            </name>
            <name name-style="western">
              <surname>Duval</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Capabilities of GPT-4 in ophthalmology: an analysis of model entropy and progress towards human-level medical question answering</article-title>
          <source>Br J Ophthalmol</source>
          <year>2023</year>
          <fpage>bjo-2023-324438</fpage>
          <pub-id pub-id-type="doi">10.1136/bjo-2023-324438</pub-id>
          <pub-id pub-id-type="medline">37923374</pub-id>
          <pub-id pub-id-type="pii">bjo-2023-324438</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Howard</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hope</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Gerada</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT and antimicrobial advice: the end of the consulting infection doctor?</article-title>
          <source>Lancet Infect Dis</source>
          <year>2023</year>
          <volume>23</volume>
          <issue>4</issue>
          <fpage>405</fpage>
          <lpage>406</lpage>
          <pub-id pub-id-type="doi">10.1016/S1473-3099(23)00113-5</pub-id>
          <pub-id pub-id-type="medline">36822213</pub-id>
          <pub-id pub-id-type="pii">S1473-3099(23)00113-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nastasi</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Courtright</surname>
              <given-names>KR</given-names>
            </name>
            <name name-style="western">
              <surname>Halpern</surname>
              <given-names>SD</given-names>
            </name>
            <name name-style="western">
              <surname>Weissman</surname>
              <given-names>GE</given-names>
            </name>
          </person-group>
          <article-title>A vignette-based evaluation of ChatGPT's ability to provide appropriate and equitable medical advice across care contexts</article-title>
          <source>Sci Rep</source>
          <year>2023</year>
          <volume>13</volume>
          <issue>1</issue>
          <fpage>17885</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-023-45223-y"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-023-45223-y</pub-id>
          <pub-id pub-id-type="medline">37857839</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-023-45223-y</pub-id>
          <pub-id pub-id-type="pmcid">PMC10587094</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lukac</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dayan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Fink</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Leinert</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Hartkopf</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Veselinovic</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Janni</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Rack</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Pfister</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Heitmeir</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ebner</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Evaluating ChatGPT as an adjunct for the multidisciplinary tumor board decision-making in primary breast cancer cases</article-title>
          <source>Arch Gynecol Obstet</source>
          <year>2023</year>
          <volume>308</volume>
          <issue>6</issue>
          <fpage>1831</fpage>
          <lpage>1844</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37458761"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s00404-023-07130-5</pub-id>
          <pub-id pub-id-type="medline">37458761</pub-id>
          <pub-id pub-id-type="pii">10.1007/s00404-023-07130-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC10579162</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eggmann</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Blatz</surname>
              <given-names>MB</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT: chances and challenges for dentistry</article-title>
          <source>Compend Contin Educ Dent</source>
          <year>2023</year>
          <volume>44</volume>
          <issue>4</issue>
          <fpage>220</fpage>
          <lpage>224</lpage>
          <pub-id pub-id-type="medline">37075729</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kuroiwa</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sarcon</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ibara</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Yamada</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Yamamoto</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Tsukamoto</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Fujita</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>The potential of ChatGPT as a self-diagnostic tool in common orthopedic diseases: exploratory study</article-title>
          <source>J Med Internet Res</source>
          <year>2023</year>
          <volume>25</volume>
          <fpage>e47621</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2023/1/e47621/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/47621</pub-id>
          <pub-id pub-id-type="medline">37713254</pub-id>
          <pub-id pub-id-type="pii">v25i1e47621</pub-id>
          <pub-id pub-id-type="pmcid">PMC10541638</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Xin</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT in medicine: prospects and challenges: a review article</article-title>
          <source>Int J Surg</source>
          <year>2024</year>
          <volume>110</volume>
          <issue>6</issue>
          <fpage>3701</fpage>
          <lpage>3706</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/38502861"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/JS9.0000000000001312</pub-id>
          <pub-id pub-id-type="medline">38502861</pub-id>
          <pub-id pub-id-type="pii">01279778-990000000-01226</pub-id>
          <pub-id pub-id-type="pmcid">PMC11175750</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Currie</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Robbie</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tually</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT and patient information in nuclear medicine: GPT-3.5 versus GPT-4</article-title>
          <source>J Nucl Med Technol</source>
          <year>2023</year>
          <volume>51</volume>
          <issue>4</issue>
          <fpage>307</fpage>
          <lpage>313</lpage>
          <pub-id pub-id-type="doi">10.2967/jnmt.123.266151</pub-id>
          <pub-id pub-id-type="medline">37699647</pub-id>
          <pub-id pub-id-type="pii">jnmt.123.266151</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Duey</surname>
              <given-names>AH</given-names>
            </name>
            <name name-style="western">
              <surname>Nietsch</surname>
              <given-names>KS</given-names>
            </name>
            <name name-style="western">
              <surname>Zaidat</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ren</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ndjonko</surname>
              <given-names>LCM</given-names>
            </name>
            <name name-style="western">
              <surname>Shrestha</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Rajjoub</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ahmed</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Hoang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Saturno</surname>
              <given-names>MP</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Gallate</surname>
              <given-names>ZS</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>SK</given-names>
            </name>
          </person-group>
          <article-title>Thromboembolic prophylaxis in spine surgery: an analysis of ChatGPT recommendations</article-title>
          <source>Spine J</source>
          <year>2023</year>
          <volume>23</volume>
          <issue>11</issue>
          <fpage>1684</fpage>
          <lpage>1691</lpage>
          <pub-id pub-id-type="doi">10.1016/j.spinee.2023.07.015</pub-id>
          <pub-id pub-id-type="medline">37499880</pub-id>
          <pub-id pub-id-type="pii">S1529-9430(23)03285-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Guerra</surname>
              <given-names>GA</given-names>
            </name>
            <name name-style="western">
              <surname>Hofmann</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Sobhani</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hofmann</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Gomez</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Soroudi</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Hopkins</surname>
              <given-names>BS</given-names>
            </name>
            <name name-style="western">
              <surname>Dallas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Pangal</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Cheok</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>VN</given-names>
            </name>
            <name name-style="western">
              <surname>Mack</surname>
              <given-names>WJ</given-names>
            </name>
            <name name-style="western">
              <surname>Zada</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>GPT-4 artificial intelligence model outperforms ChatGPT, medical students, and neurosurgery residents on neurosurgery written board-like questions</article-title>
          <source>World Neurosurg</source>
          <year>2023</year>
          <volume>179</volume>
          <fpage>e160</fpage>
          <lpage>e165</lpage>
          <pub-id pub-id-type="doi">10.1016/j.wneu.2023.08.042</pub-id>
          <pub-id pub-id-type="medline">37597659</pub-id>
          <pub-id pub-id-type="pii">S1878-8750(23)01144-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhong</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Yi</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hou</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Enhanced artificial intelligence strategies in renal oncology: iterative optimization and comparative analysis of GPT 3.5 versus 4.0</article-title>
          <source>Ann Surg Oncol</source>
          <year>2024</year>
          <volume>31</volume>
          <issue>6</issue>
          <fpage>3887</fpage>
          <lpage>3893</lpage>
          <pub-id pub-id-type="doi">10.1245/s10434-024-15107-0</pub-id>
          <pub-id pub-id-type="medline">38472675</pub-id>
          <pub-id pub-id-type="pii">10.1245/s10434-024-15107-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hasani</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zahergivar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ryan</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Nethala</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bravomontenegro</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Mendhiratta</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Ball</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Farhadi</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Malayeri</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Evaluating the performance of generative pre-trained transformer-4 (GPT-4) in standardizing radiology reports</article-title>
          <source>Eur Radiol</source>
          <year>2024</year>
          <volume>34</volume>
          <issue>6</issue>
          <fpage>3566</fpage>
          <lpage>3574</lpage>
          <pub-id pub-id-type="doi">10.1007/s00330-023-10384-x</pub-id>
          <pub-id pub-id-type="medline">37938381</pub-id>
          <pub-id pub-id-type="pii">10.1007/s00330-023-10384-x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kanjee</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Crowe</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Rodman</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Accuracy of a generative artificial intelligence model in a complex diagnostic challenge</article-title>
          <source>JAMA</source>
          <year>2023</year>
          <volume>330</volume>
          <issue>1</issue>
          <fpage>78</fpage>
          <lpage>80</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37318797"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jama.2023.8288</pub-id>
          <pub-id pub-id-type="medline">37318797</pub-id>
          <pub-id pub-id-type="pii">2806457</pub-id>
          <pub-id pub-id-type="pmcid">PMC10273128</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Horiuchi</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Tatekawa</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Oura</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Oue</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Walston</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Takita</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Matsushita</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mitsuyama</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Shimono</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Miki</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ueda</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Comparing the diagnostic performance of GPT-4-based ChatGPT, GPT-4V-based ChatGPT, and radiologists in challenging neuroradiology cases</article-title>
          <source>Clin Neuroradiol</source>
          <year>2024</year>
          <pub-id pub-id-type="doi">10.1007/s00062-024-01426-y</pub-id>
          <pub-id pub-id-type="medline">38806794</pub-id>
          <pub-id pub-id-type="pii">10.1007/s00062-024-01426-y</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Giannos</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Evaluating the limits of AI in medical specialisation: ChatGPT's performance on the UK neurology specialty certificate examination</article-title>
          <source>BMJ Neurol Open</source>
          <year>2023</year>
          <volume>5</volume>
          <issue>1</issue>
          <fpage>e000451</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37337531"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmjno-2023-000451</pub-id>
          <pub-id pub-id-type="medline">37337531</pub-id>
          <pub-id pub-id-type="pii">bmjno-2023-000451</pub-id>
          <pub-id pub-id-type="pmcid">PMC10277081</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ahmed</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Saturno</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rajjoub</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Duey</surname>
              <given-names>AH</given-names>
            </name>
            <name name-style="western">
              <surname>Zaidat</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Hoang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Restrepo Mejia</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gallate</surname>
              <given-names>ZS</given-names>
            </name>
            <name name-style="western">
              <surname>Shrestha</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zapolsky</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>SK</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT versus NASS clinical guidelines for degenerative spondylolisthesis: a comparative analysis</article-title>
          <source>Eur Spine J</source>
          <year>2024</year>
          <pub-id pub-id-type="doi">10.1007/s00586-024-08198-6</pub-id>
          <pub-id pub-id-type="medline">38489044</pub-id>
          <pub-id pub-id-type="pii">10.1007/s00586-024-08198-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Delsoz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Madadi</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Raja</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Munir</surname>
              <given-names>WM</given-names>
            </name>
            <name name-style="western">
              <surname>Tamm</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Mehravaran</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Soleimani</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Djalilian</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Yousefi</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT in diagnosis of corneal eye diseases</article-title>
          <source>Cornea</source>
          <year>2024</year>
          <volume>43</volume>
          <issue>5</issue>
          <fpage>664</fpage>
          <lpage>670</lpage>
          <pub-id pub-id-type="doi">10.1097/ICO.0000000000003492</pub-id>
          <pub-id pub-id-type="medline">38391243</pub-id>
          <pub-id pub-id-type="pii">00003226-202405000-00019</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Exploring the clinical capabilities and limitations of ChatGPT: a cautionary tale for medical applications</article-title>
          <source>Int J Surg</source>
          <year>2023</year>
          <volume>109</volume>
          <issue>9</issue>
          <fpage>2865</fpage>
          <lpage>2867</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37222684"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/JS9.0000000000000523</pub-id>
          <pub-id pub-id-type="medline">37222684</pub-id>
          <pub-id pub-id-type="pii">01279778-990000000-00387</pub-id>
          <pub-id pub-id-type="pmcid">PMC10498888</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Emergency surgery in the era of artificial intelligence: ChatGPT could be the doctor's right-hand man</article-title>
          <source>Int J Surg</source>
          <year>2023</year>
          <volume>109</volume>
          <issue>6</issue>
          <fpage>1816</fpage>
          <lpage>1818</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37074733"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/JS9.0000000000000410</pub-id>
          <pub-id pub-id-type="medline">37074733</pub-id>
          <pub-id pub-id-type="pii">01279778-202306000-00034</pub-id>
          <pub-id pub-id-type="pmcid">PMC10389530</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
