<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v25i1e41043</article-id>
      <article-id pub-id-type="pmid">36637893</article-id>
      <article-id pub-id-type="doi">10.2196/41043</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>An Accurate Deep Learning–Based System for Automatic Pill Identification: Model Development and Validation</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Leung</surname>
            <given-names>Tiffany</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Waterson</surname>
            <given-names>James</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Li</surname>
            <given-names>Zhongqiang</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Soerensen</surname>
            <given-names>Simon John Christoph</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Heo</surname>
            <given-names>Junyeong</given-names>
          </name>
          <degrees>BMath</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5525-9069</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Kang</surname>
            <given-names>Youjin</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6808-5157</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Lee</surname>
            <given-names>SangKeun</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6249-8217</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author" corresp="yes" equal-contrib="yes">
          <name name-style="western">
            <surname>Jeong</surname>
            <given-names>Dong-Hwa</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <address>
            <institution>Department of Artificial Intelligence</institution>
            <institution>The Catholic University of Korea</institution>
            <addr-line>T908 Michael Building</addr-line>
            <addr-line>The Catholic University of Korea, 43 Jibong-ro</addr-line>
            <addr-line>Bucheon, 14662</addr-line>
            <country>Republic of Korea</country>
            <phone>82 10 6707 6977</phone>
            <email>donghwa@catholic.ac.kr</email>
          </address>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4896-9681</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Kim</surname>
            <given-names>Kang-Min</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2335-7072</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Mathematics</institution>
        <institution>Korea University</institution>
        <addr-line>Seoul</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Visual Display Division</institution>
        <institution>Samsung Electronics</institution>
        <addr-line>Suwon</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Computer Science and Engineering</institution>
        <institution>Korea University</institution>
        <addr-line>Seoul</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Artificial Intelligence</institution>
        <institution>Korea University</institution>
        <addr-line>Seoul</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Department of Artificial Intelligence</institution>
        <institution>The Catholic University of Korea</institution>
        <addr-line>Bucheon</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <aff id="aff6">
        <label>6</label>
        <institution>Department of Data Science</institution>
        <institution>The Catholic University of Korea</institution>
        <addr-line>Bucheon</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Dong-Hwa Jeong <email>donghwa@catholic.ac.kr</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2023</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>13</day>
        <month>1</month>
        <year>2023</year>
      </pub-date>
      <volume>25</volume>
      <elocation-id>e41043</elocation-id>
      <history>
        <date date-type="received">
          <day>13</day>
          <month>7</month>
          <year>2022</year>
        </date>
        <date date-type="rev-request">
          <day>19</day>
          <month>10</month>
          <year>2022</year>
        </date>
        <date date-type="rev-recd">
          <day>24</day>
          <month>11</month>
          <year>2022</year>
        </date>
        <date date-type="accepted">
          <day>25</day>
          <month>11</month>
          <year>2022</year>
        </date>
      </history>
      <copyright-statement>©Junyeong Heo, Youjin Kang, SangKeun Lee, Dong-Hwa Jeong, Kang-Min Kim. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 13.01.2023.</copyright-statement>
      <copyright-year>2023</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2023/1/e41043" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Medication errors account for a large proportion of all medical errors. In most homes, patients take a variety of medications for a long period. However, medication errors frequently occur because patients often throw away the containers of their medications.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>We proposed a deep learning–based system for reducing medication errors by accurately identifying prescription pills. Given the pill images, our system located the pills in the respective pill databases in South Korea and the United States.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We organized the system into a pill recognition step and pill retrieval step, and we applied deep learning models to train not only images of the pill but also imprinted characters. In the pill recognition step, there are 3 modules that recognize the 3 features of pills and their imprints separately and correct the recognized imprint to fit the actual data. We adopted image classification and text detection models for the feature and imprint recognition modules, respectively. In the imprint correction module, we introduced a language model for the first time in the pill identification system and proposed a novel coordinate encoding technique for effective correction in the language model. We identified pills using similarity scores of pill characteristics with those in the database.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>We collected the open pill database from South Korea and the United States in May 2022. We used a total of 24,404 pill images in our experiments. The experimental results show that the predicted top-1 candidates achieve accuracy levels of 85.6% (South Korea) and 74.5% (United States) for the types of pills not trained on 2 different databases (South Korea and the United States). Furthermore, the predicted top-1 candidate accuracy of our system was 78% with consumer-granted images, which was achieved by training only 1 image per pill. The results demonstrate that our system could identify and retrieve new pills without additional model updates. Finally, we confirmed through an ablation study that the language model that we emphasized significantly improves the pill identification ability of the system.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Our study proposes the possibility of reducing medical errors by showing that the introduction of artificial intelligence can identify numerous pills with high precision in real time. Our study suggests that the proposed system can reduce patients’ misuse of medications and help medical staff focus on higher-level tasks by simplifying time-consuming lower-level tasks such as pill identification.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>pill identification</kwd>
        <kwd>pill retrieval</kwd>
        <kwd>pill recognition</kwd>
        <kwd>automatic pill search</kwd>
        <kwd>deep learning</kwd>
        <kwd>machine learning</kwd>
        <kwd>character-level language model</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>It is important to identify the type of medication to avoid medical errors because it directly influences patients’ health and causes enormous losses. The Organisation for Economic Co-operation and Development (OECD) reported that 18.3% of adverse events in patients are medication errors [<xref ref-type="bibr" rid="ref1">1</xref>]. The misuse of medications is a critical issue for patients, resulting in adverse medication effects such as injuries and complications [<xref ref-type="bibr" rid="ref2">2</xref>]. In addition, the estimated health care cost caused by adverse medication events was more than US $76 billion in 2014 and has had an upward tendency yearly [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. Nowadays, the types of medications are highly diverse owing to the rapid development of new medications and increased international trade of medications. As a result, it burdens medical staff because they should find pills manually from the pill database whenever patients ask for identifying their prescribed medications. This is because patients often discard their medication’s container that includes the prescription. In addition, medication errors can be caused by polypharmacy such as prescribing cascade and poor medication reconciliation [<xref ref-type="bibr" rid="ref5">5</xref>]. To alleviate these errors, many countries have recently developed computerized medication systems that use information technology to identify medications and recognize the interactions between medications. For example, in South Korea, the Ministry of Food and Drug Safety (MFDS) has provided specifications for medications, including ingredients, images, and precautions. 
On the basis of their database, which is updated when a new pill is approved and enrolled, medical staff can check for potential adverse medication events, and patients can also identify medications [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>].</p>
        <p>However, it is necessary to develop automated systems because the current computerized medication systems that require the passive input of surface information have disadvantages in terms of use. Although the current computerized medication systems are helpful in preventing the misuse of medications, they require pharmaceutical expertise and significant labor to search for pills. This is largely attributed to the fact that users such as medical staff and patients must manually enter the exact names or properties of the medications. Previous studies have shown that pharmacists at university hospitals have spent an average of approximately 20 hours a month identifying medication, but approximately 25% of the medications were not identifiable in the conventional system [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. The results were similar when expanding the study scope to a city [<xref ref-type="bibr" rid="ref11">11</xref>]. They pointed out that this result was not only because of the broken pills but also because it was difficult for the medical staff to carefully identify pills using the conventional system. Although pills are the most widely used medication forms owing to their convenience of use and storage, it is difficult to identify the information on pills (eg, imprints) owing to their small size. Thus, although systems for identifying pills have been established in most countries, the preceding results highlight the limitations of the current pill search systems, despite their importance in the clinical field. In addition, it is very difficult for patients—who have no medical expertise—to identify pills while taking multiple doses of different medications in their daily lives. Therefore, many attempts have been made to develop automated pill search systems. 
These systems aim to help medical staff by mitigating the lower-level workload of pill identification and patients by providing appropriate information to avoid the intake of wrong pills. Although there are several studies aimed at developing automated pill identification systems, most of them have focused on recognizing characters imprinted on pill surfaces [<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref16">16</xref>]. However, because the imprints on pills are small and contain abbreviations, it is difficult to recognize them quickly and accurately. Therefore, it is necessary to develop a fast and accurate automatic system that identifies pills by recognizing the imprinted characters with high accuracy.</p>
      </sec>
      <sec>
        <title>Prior Work</title>
        <p>Surveys on medication errors and misuse highlighted the need for automatic, accurate, and rapid pill identification [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. Recently, several studies have proposed artificial intelligence (AI)–based pill recognition systems. These approaches can be a promising research direction because the performance of medical staff in pill identification was improved with the assistance of an AI-based system. Cho et al [<xref ref-type="bibr" rid="ref19">19</xref>] discussed a method for identifying pills using image processing methods. In 2016, the Pill Image Recognition Challenge was held in the United States, and 7000 images were provided as a data set for 1000 pills. Pill recognition models using this data set were actively studied, even after the conclusion of the competition [<xref ref-type="bibr" rid="ref20">20</xref>]. Zeng et al [<xref ref-type="bibr" rid="ref21">21</xref>] won the competition by proposing a model that utilized a triple network to distinguish between similar pills. Larios Delgado et al [<xref ref-type="bibr" rid="ref22">22</xref>] proposed a model that utilizes a deep convolutional neural network (CNN) and compared it with various image classification models pretrained on ImageNet. Chang et al [<xref ref-type="bibr" rid="ref23">23</xref>] proposed pill classification devices using a feature pyramid network with a 50-layer Residual Network (ResNet) as the backbone. However, they focused on real-time object detection, and the accuracy of pill identification was not a major consideration. The model proposed by Wong et al [<xref ref-type="bibr" rid="ref16">16</xref>] used a GoogLeNet Inception network [<xref ref-type="bibr" rid="ref24">24</xref>] based on a CNN and input images captured by the authors under various conditions. 
The model in the study by Wang et al [<xref ref-type="bibr" rid="ref25">25</xref>] used AlexNet [<xref ref-type="bibr" rid="ref26">26</xref>] and focused on the geometric transformation of pills in images. In previous studies on the identification of pills, all pill types in the test data set were included in the training data set with different images. Consequently, the models proposed in previous studies have 2 critical issues: (1) they suffer from a lack of generalizability in data sets that differ from their training data set, and (2) they find it difficult to identify newly approved pills.</p>
        <p>Previous studies have commonly preprocessed pill images and then used image classification models to learn the surface area information of pills to perform classification tasks within a data set. Despite their diversity, several pills share limited information on appearance features (ie, shape, color, and form). For example, many pills take the form of white circular tablets. This reduces the accuracy of pill identification and is a challenging factor in pill search models. Therefore, to increase the accuracy of pill identification, information on the characters imprinted on the pills can play a critical role. Researchers have also been aware of the importance of these characters and have attempted to solve this problem through image preprocessing. Traditional pill search models [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>] rely only on image classification methodologies, even if imprinted data are preprocessed through grayscale transformations, lighting control, or noise removal in images. However, it is difficult to distinguish similar character sequences such as “MIO” and “M10” because they are likely to be recognized as the same image when the shapes of the characters are similar. In addition, previous methods [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref25">25</xref>] impose the burden of updating the model whenever a new drug is developed because they only include registered pills in the evaluation data set, which leads to poor generalization. Meanwhile, there is a study that considered the characters imprinted on pills and used national drug codes for the identification of pills [<xref ref-type="bibr" rid="ref27">27</xref>]. 
However, because the recognition of pill imprints depends solely on the image classification model (ResNet-18), there is a high probability of incorrect character recognition, which can lead to the incorrect prediction of drug codes, as in previous studies.</p>
      </sec>
      <sec>
        <title>Goal of This Study</title>
        <p>Our method considers the imprinted characters on pills as crucial information for pill identification. We adopted a character-level language model and convolutional networks for recognizing other features (ie, shape, color, and form). In addition, we divided the types of pills in the training and evaluation data sets to improve generalizability and thus the identification of new pills. We overcame the limitations of the existing pill search models by designing a system that focuses on imprinted characters. First, the object detection model <italic>You Only Look Once</italic> (YOLO) [<xref ref-type="bibr" rid="ref28">28</xref>] version 5 [<xref ref-type="bibr" rid="ref29">29</xref>] was used to learn the locations and types of imprinted characters in a pill image. Next, the object recognition model (ie, ResNet-32) was used to learn the shape, color, and form of the pill [<xref ref-type="bibr" rid="ref30">30</xref>]. In addition, we drew inspiration from the natural language processing field and considered the features of pills as context to learn the imprinted characters on pills in units of alphabet and number. In this study, the appearance of the pill (ie, shape, color, and form) is defined as <italic>features</italic>, and features and the imprinted characters are collectively referred to as <italic>characteristics</italic>.</p>
        <p>The character-level language model receives the characters detected by the object detection model and modifies them to match the shape and order of the characters on the pill. For example, when it receives an input of “MOI,” it predicts the next letter after “M” based on the appearance of the pill (ie, features) and corrects it. We separately trained an imprint recognition model that extracts imprints from images of pills and an imprint correction model that corrects characters based on the context of recurrent neural networks (RNNs) [<xref ref-type="bibr" rid="ref31">31</xref>-<xref ref-type="bibr" rid="ref35">35</xref>]. Moreover, the features of pills were utilized as a context of imprints in the character-level language model. Consequently, a successful pill identification ability was established without complicated preprocessing of images for imprint identification. We noted that the character correction module is important because pill images may contain important information such as medicinal ingredients, amounts of ingredients, and pharmaceutical company names. We considered a total 24,404 pills: 20,517 (84.07%) pills using the data provided by the MFDS in South Korea and 3887 (15.93%) pill samples from the National Library of Medicine (NLM) [<xref ref-type="bibr" rid="ref20">20</xref>] database in the United States. The main contributions of this study are described in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref>.</p>
        <boxed-text id="box1" position="float">
          <title>A summary of the main contributions and results of this study.</title>
          <list list-type="bullet">
            <list-item>
              <p>We proposed a pill identification system based on a deep learning approach. The proposed model extracts 3 features (ie, shape, color, and form) and imprinted characters from a given pill image and retrieves results from a database, even for pills that are not in the training data set.</p>
            </list-item>
            <list-item>
              <p>We incorporated a character-level language model into the proposed pill identification system. To the best of our knowledge, we are the first to introduce a language model into a pill identification system. Moreover, we proposed a novel coordinate encoding technique for imprinted characters on pills. Our model extracts the features of pills and uses them for classification as well as for context in the language model to improve the performance.</p>
            </list-item>
            <list-item>
              <p>We confirmed that our model is robust to newly approved pills and generalizable on 2 different databases: one from the Ministry of Food and Drug Safety (MFDS; South Korea) and the other from the National Library of Medicine (NLM; the United States). Our model is also applicable to consumers because it can evaluate consumer images.</p>
            </list-item>
            <list-item>
              <p>We evaluated the proposed system with the MFDS and NLM data sets. Our system achieved 85.65% and 74.46% accuracy of the top-1 candidate for the types of pills not used in the training for the MFDS and NLM data sets, respectively. Moreover, our model (78%) outperformed the baseline (76.9%) in evaluating consumer images.</p>
            </list-item>
          </list>
        </boxed-text>
      </sec>
      <sec>
        <title>Organization</title>
        <p>The remainder of this paper has been organized as follows. In the next section, we introduce and analyze pill databases. We then discuss the overall process of the proposed system and the implementation of each module in the recognition unit and search units. In the <italic>Results</italic> section, the experimental setups, analyses, and results are presented. We have evaluated the proposed system on the types of pills that were not used in the training and demonstrated its ability to identify newly approved pills using reference images in the MFDS and NLM databases. In addition to the consumer images with varying lighting conditions in the NLM database, we have highlighted the effectiveness of the proposed system by comparing it with a state-of-the-art model. Finally, we discuss the results and conclusions.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Database</title>
        <p>In total, 2 data sets of pill images were utilized in this study. The database provided by the MFDS, which included sample images of 20,517 (N=24,404, 84.07%) pills, was used for the experiments. In this database, each pill has one reference image, and each image includes both the front and rear photos of the pill (<xref rid="figure1" ref-type="fig">Figure 1</xref>A). Of 20,517, we separated the samples into 8000 (38.99%) samples for a training data set and 12,517 (61.01%) samples for a test data set. To evaluate the system’s performance, we constructed the test data set with pills that were completely different from those in the training data set. We adopted this setup by assuming that new pills are updated in the database. Another database provided by the NLM in the United States, which is widely used in pill recognition tasks, was also adopted for evaluating whether the system can be applied to different data sets. This database includes various forms of images of pills. As a result, images from the NLM database, which consist of both faces of pills without additional information, for each of the 15.93% (3887/24,404) of pills were used for the experiments (<xref rid="figure1" ref-type="fig">Figure 1</xref>B). We fine-tuned the model using 25.73% (1000/3887) of reference images, and the remaining 74.27% (2887/3887) of reference images were used for the evaluation. Furthermore, we also evaluated the model with consumer-grade pill images given by the NLM to examine its robustness in terms of more challenging inference and generalization ability. In the NLM database, the size, position, and lighting of the consumer-grade pill images were not adjusted (<xref rid="figure1" ref-type="fig">Figure 1</xref>C). In the evaluation using consumer-grade pill images (hereinafter consumer images), the model was fine-tuned with randomly chosen 25.73% (1000/3887) of reference images. 
The remaining 74.27% (2887/3887) of pills were not used in the training data set but were used in the test data set. It should be noted that the 25.73% (1000/3887) of pill species in the reference image samples in the training data set are the same as the 25.73% (1000/3887) of pill species in the test sets but the images are different. In this work, we consider only uppercase characters, except for “mg” (ie, milligram), for analysis to improve the identification of characters. We discuss this issue in the limitations section.</p>
        <p>In the MFDS database, the shapes can be classified into 10 types: round, oblong, oval, triangle, square, diamond, pentagon, hexagon, octagon, and others. “Others” includes unusual shapes such as semicircle, adjacent circle, bullet shape, rectangular shape with a concave center, heart shape, etc. As for colors, there were 16 categories: white, yellow, orange, pink, red, brown, light green, green, cyan, blue, navy, purple, gray, black, violet, and transparent. Finally, the forms included were tablets and capsules (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). We found that white circular tablets were the most prominent in the data set. In the NLM database, the most common shapes of pills were circular (1795/3887, 46.18%), followed by oval (1361/3887, 35.01%) and oblong (601/3887, 15.46%). There were &#60;1% of the following shapes: square, triangle, pentagon, hexagon, octagon, diamond, semicircle, rectangle, and “others.” In the training data set for fine-tuning, we mapped the rectangular shape to square and others (eg, tear, double circle, or trapezoid) to “others.” The most common color was white (1643/3887, 42.27%), followed by yellow (493/3887, 12.68%), pink (347/3887, 8.93%), orange (336/3887, 8.64%), blue (328/3887, 8.44%), brown (224/3887, 5.76%), and green (206/3887, 5.53%). Less than 5% of the pills were red, purple, gray, turquoise, or black in color. The proportions of capsules and tablets were 15% and 85%, respectively.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>(A) An example of the reference image from the Ministry of Food and Drug Safety database is shown. (B) An example of the reference image from the National Library of Medicine database is shown. (C) An example of the consumer image from the National Library of Medicine database is shown.</p>
          </caption>
          <graphic xlink:href="jmir_v25i1e41043_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Model Architecture</title>
        <p>The proposed framework is illustrated in <xref rid="figure2" ref-type="fig">Figure 2</xref>. In the pill recognition step, a pill image is fed into the object detection model YOLOv5 [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>] and image classification model ResNet-32 [<xref ref-type="bibr" rid="ref30">30</xref>]. YOLOv5 extracts characters and their coordinates from the image, whereas ResNet-32 recognizes and classifies the pill’s shape, color, and form. The shape, color, form, and characters with their coordinates are then used as inputs for the RNN-based character-level language model [<xref ref-type="bibr" rid="ref31">31</xref>-<xref ref-type="bibr" rid="ref35">35</xref>]. In the character-level language model, the input characters are corrected using the coordinates of the characters, with the shape, color, and form of the pills acting as contexts. In the pill retrieval step, the similarity is calculated and then ranked by comparing the features (ie, shape, color, and form) extracted from ResNet-32 and imprints corrected from the RNN with the characteristics (ie, features and imprints) of the pills in the databases.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>A pipeline of our system to recognize the pills’ characteristics and retrieve them from the database. IKOIM denotes an example of a detected imprint from the raw image, and M10_KI denotes an example of a corrected imprint from IKOIM.</p>
          </caption>
          <graphic xlink:href="jmir_v25i1e41043_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Pill Recognition</title>
        <sec>
          <title>Imprint Detection</title>
          <p>In this section, we have described the imprint detection module in the pill recognition step shown in <xref rid="figure2" ref-type="fig">Figure 2</xref> and how it is trained. In this module, we used YOLO as a text detection model. YOLO is an end-to-end model widely used in object detection [<xref ref-type="bibr" rid="ref28">28</xref>]. In this model, when an input image is fed to multiple convolutional layers, the model classifies objects by predicting their bounding boxes in the image. The model can detect not only objects but also characters in real time. Because this model can detect texts at a rate of approximately 45 frames per second, YOLO-based text detection models have been utilized in various fields such as robotics [<xref ref-type="bibr" rid="ref36">36</xref>], industrial automation [<xref ref-type="bibr" rid="ref37">37</xref>], and image searching [<xref ref-type="bibr" rid="ref38">38</xref>]. In this study, we selected YOLOv5 [<xref ref-type="bibr" rid="ref29">29</xref>] for detecting imprinted characters on pills for real-time inference.</p>
          <p>We adopted a pretrained YOLOv5 based on ImageNet [<xref ref-type="bibr" rid="ref39">39</xref>] and fine-tuned the model using the pill images from our training data sets. Because pill images have different angles depending on pill shapes, the images are automatically rotated to place each pill horizontally using OpenCV for Python. Because there are no ground truth labels for the bounding boxes and coordinates of the characters in pill images, we annotated the labels for the bounding boxes to construct a fine-tuning data set specialized for text detection in pill images. <xref rid="figure3" ref-type="fig">Figure 3</xref> shows a sample of the labeled training data set for this module.</p>
          <p>Although character recognition using the YOLO model exhibits good accuracy, the order of characters is not considered. Furthermore, despite the high accuracy of character recognition, the network tends to confuse alphabetical characters and numbers, such as “O” and “0” or “I” and “1.” Distinguishing such characters is critical for correctly identifying pills. We noted that the misclassification of characters is a major factor leading to the poor accuracy of the previous pill identification systems that use image classification models alone. Therefore, we developed the character correction module to correct the characters extracted by the imprint detection module. We describe the correction process in detail in the imprint correction section.</p>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>A sample of the labeled training data set for the imprint detection module is presented.</p>
            </caption>
            <graphic xlink:href="jmir_v25i1e41043_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Feature Recognition</title>
          <p>In this section, we demonstrate the process of feature extraction using ResNet and a multitask learning method. ResNet [<xref ref-type="bibr" rid="ref30">30</xref>] was the first model to outperform humans in the image classification competitions ImageNet Large Scale Visual Recognition Challenge in 2015 [<xref ref-type="bibr" rid="ref40">40</xref>] and Common Objects in Context in 2015 [<xref ref-type="bibr" rid="ref41">41</xref>]. This model was implemented for more accurate training based on the concept of residual learning. We trained ResNet-32, which consists of 32 layers, to recognize the shapes, colors, and forms of pills among the features in the input image.</p>
          <p>We trained the model to recognize 11 shapes (<italic>s</italic>), 16 colors (<italic>c</italic>), and 2 forms (<italic>f</italic>), as described in the database section. ResNet-32 uses an image of a pill as an input and produces a vector output of the hidden dimension size (<italic>h</italic>) characterizing the pill. This vector is used to produce total feature vectors by propagating to the weight matrices <italic>w[s]</italic>, <italic>w[c]</italic>, and <italic>w[f]</italic> with sizes of <italic>h</italic> × <italic>s</italic>, <italic>h</italic> × <italic>c</italic>, and <italic>h</italic> × <italic>f</italic> to express the shapes, colors, and forms, respectively. For each item, 3 loss functions were calculated, and the total loss function was obtained by summing the 3 loss functions. The loss function was used to determine the error of the system prediction during training, and the model was optimized by minimizing the error. On the basis of the total loss, the weights in ResNet-32, including those of each feature, were updated.</p>
          <p>When an image of a pill <italic>i</italic> produces a feature output as <italic>p[i]</italic> from the convolutional layer in ResNet-32, we obtain <italic>z[i,s]=softmax(w[s]p[i])</italic>, which contains the feature information of pill <italic>i</italic>, using a softmax function in the shape extraction module. The loss of the shape extraction module is determined by cross-entropy loss. The color has 16 categories, but a capsule often has more than one color; therefore, we selected 1 color for each capsule and calculated the loss in the form of cross-entropy loss. Therefore, the result corresponding to the color <italic>j</italic> for the pill <italic>i</italic> is <italic>z[i,c]=softmax(w[c]p[i])</italic>. Finally, we divided the pill forms into tablets and capsules. The form information for the pill <italic>i</italic> is as follows: <italic>z[i,f]=softmax(w[f]p[i])</italic>. The loss functions of each feature are as follows: </p>
          <disp-formula>
            <graphic xlink:href="jmir_v25i1e41043_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>where <italic>k</italic> can be <italic>s</italic>, <italic>c,</italic> or <italic>f</italic>, which denote the shape, color, or form, respectively, and <italic>N</italic> denotes the number of species for each feature (11 for <italic>s</italic>, 16 for <italic>c</italic>, or 2 for <italic>f</italic>). Here, <italic>I[i,j,k]</italic> ∊(0, 1) is the ground truth label for shape, color, and form. As a result, the total loss of the shape extraction module consists of all losses multiplied by hyperparameter weights for each loss. We set all weights to 1, where the values were determined empirically.</p>
        </sec>
        <sec>
          <title>Imprint Correction</title>
          <p>We encoded the imprinted characters and features of a pill into the model by considering them as a sequence and context, respectively. Then, we trained the model to correct the characters to achieve a desired outcome. It is known that the imprints on a pill contain information regarding its ingredients, the amount of each ingredient, and the names of the medication companies. This information is important for the identification of pills. Because there is a tendency to write a series of numerical sequences on pill components and their amounts are similar to each other, the imprint calibration module can be effectively trained. The characters and their coordinates extracted by the imprint detection module (refer to the imprint detection section) are inputted into the imprint correction module. We propose a character-level RNN [<xref ref-type="bibr" rid="ref31">31</xref>-<xref ref-type="bibr" rid="ref35">35</xref>] with a gated recurrent unit mechanism [<xref ref-type="bibr" rid="ref42">42</xref>] as a language model for correcting imprinted characters on pills. In this process, we propose a novel coordinate encoding technique, considering the specificity that the data are pills rather than a generic corpus. Coordinate encoding is performed to extract the position of each letter in the pill image as coordinates and enter it into the model as one piece of information for encoding it. We not only modified the imprinted characters on the pill with high accuracy but also effectively modified the order of the characters using their coordinates through a character correction module that supplements coordinate encoding techniques.</p>
          <p>We considered an attention-based bidirectional sequence-to-sequence model [<xref ref-type="bibr" rid="ref35">35</xref>] with a many-to-many RNN structure to correct individual characters and character orders. We were motivated by machine translation and adopted a similar system to correct imprinted characters on pills. For a character <italic>t</italic> with 2D coordinates (<italic>x<sub>t</sub></italic> and <italic>y<sub>t</sub></italic>) the input vector representation is a concatenation of the coordinate values and one-hot encoding of <italic>t</italic>. The representation of an input sequence is generated by the encoder and passed to the decoder. In the decoder, 1 character is generated at a time based on an attention weight that is calculated based on the character in the decoder and all the hidden states for each character in the encoder. Therefore, the conditional probability is calculated as follows: </p>
          <disp-formula>
            <graphic xlink:href="jmir_v25i1e41043_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>where tgt and src denote the target and source of the characters, respectively, with <italic>m</italic> digits of imprints. In addition, we distinguished characters on the front and back of the pills using an underscore (_). Therefore, the model should predict the exact characters and their absolute orders in the imprint. For example, if a given input extracted by the imprint detection module is “IKOIM” with coordinates for each character, then the target output is “M10_KI,” as shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>. The features of the pills extracted by the feature recognition module can be used in the imprint correction module. We proved that the correction module improves the accuracy of pill identification, as described in the ablation study section.</p>
        </sec>
      </sec>
      <sec>
        <title>Pill Retrieval</title>
        <sec>
          <title>Imprint Similarity Score</title>
          <p>Our system infers pill classes based on the results of each module for a test image. We computed similarity scores between each imprint predicted from the test image and those of all the pills found in the database. After we added similarity scores from features, we obtained the top-1 and top-3 candidates by combining the similarity scores of 3 features and imprints.</p>
          <p>We utilized the edit distance to reflect the output of the language model and the similarity scores of the imprints on pills in the database for pill retrieval. The edit distance is a measure that represents how similar 2 strings are and calculates the number of operations to be performed before one string is equal to another [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]. These operations include insertion, deletion, and substitution. Furthermore, the order of listing characters was also considered. We converted the edit distance into a similarity score and normalized it as follows: </p>
          <disp-formula>
            <graphic xlink:href="jmir_v25i1e41043_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>For example, if the output of the imprint correction module is “M10_KI,” then the edit distance from a target pill “M10SPC_” is 4 (3 exchanges and 1 insertion), and the normalized similarity is 6/13. Note that the underscore (_) in the imprints indicates the separation between the front and back sides. In addition, we added the information from the imprint detection module to the scores by calculating the overlap between the output of the imprint correction module (character-level language model) and the imprinted characters on the target pill, depending on the text length. We scored the overlapping characters as follows: </p>
          <disp-formula>
            <graphic xlink:href="jmir_v25i1e41043_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
        </sec>
        <sec>
          <title>Feature Similarity Score</title>
          <p>The weight of each feature (ie, shape, color, and form) was set equally in the similarity scores. However, we assumed that the similarity scores for imprints are even more crucial for identifying pills. Therefore, we assigned different weights when calculating the similarity between features and imprints. By contrast, in multilabel retrieval studies, probabilities, rather than similarity scores, are generally used. However, because our targets have different distributions for each label, unlike multilabel data, we set the similarity score weights for the 3 feature types to 1/3 only for the exact match of the labels. For example, if the output of the model is [square, orange, tablet], then a pill with [square, pink, tablet] has 2/3 points, a pill with [triangle, orange, capsule] has 1/3 points, and a pill with [round, yellow, capsule] has 0 points.</p>
          <p><xref ref-type="table" rid="table1">Table 1</xref> presents an example of the calculation of the similarity scores for each pill in the database. Imprint similarity scores are the last 2 terms written in italics under scores, which range from 0 to 1. One is based on the edit distance, and the other is based on the number of overlapping characters between the output of the character-level language model and the target character sequences. According to <xref ref-type="table" rid="table1">Table 1</xref>, even if the other features are the same as the source, as is the case for target 3, the score is lower than if the imprinted characters are more similar. Because the correct pill corresponding to the source is target 1, we can conclude that accurate imprint recognition is the most important aspect of pill identification.</p>
          <table-wrap position="float" id="table1">
            <label>Table 1</label>
            <caption>
              <p>Example of similarity scores for 3 target pills in the database with given source information<sup>a</sup> extracted by the proposed system.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="130"/>
              <col width="420"/>
              <col width="450"/>
              <thead>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Characteristics of target pills</td>
                  <td>Scores</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Target 1</td>
                  <td>[square, pink, tablet, M10_KI]</td>
                  <td>1/3 + 0 + 1/3 + <italic>1</italic> + <italic>1</italic> = 2.67</td>
                </tr>
                <tr valign="top">
                  <td>Target 2</td>
                  <td>[square, pink, tablet, M10_Kb]</td>
                  <td>1/3 + 0 + 1/3 + <italic>10/12</italic> + <italic>10/12</italic> = 2.33</td>
                </tr>
                <tr valign="top">
                  <td>Target 3</td>
                  <td>[square, orange, tablet, M10SPC_]</td>
                  <td>1/3 + 1/3 + 1/3 + <italic>6/13</italic> + <italic>8/13</italic> = 2.08</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table1fn1">
                <p><sup>a</sup>Source: [square, orange, tablet, M10_KI].</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Overall Experiments</title>
        <p>We evaluated the developed pill identification system in terms of the accuracy of top-1 and top-3 similarity score candidates. For convenience, we defined the probability that the answer is in the top-1 and top-3 candidates as top-1 and top-3 accuracy. Note that previous works considered top-1 and top-5 candidates [<xref ref-type="bibr" rid="ref22">22</xref>], but we chose top-3 candidates instead of top-5 candidates because our system has similar accuracy for top-3 and top-5 candidates. First, we trained our model on the reference images of 38.99% (8000/20,517) of the pill types in the MFDS database. Experiments to be evaluated with a trained system attempt to answer the following questions: (1) Can the model predict new species of pills that have not been used for training well? (2) Can the model be utilized for different types of data, not just the type of data it was trained on? (3) Can the model predict consumer images in addition to stereotyped reference images? and (4) Does our proposed language model contribute to the performance of pill identification? We have proceeded with an evaluation to answer questions 1 to 3 in the model evaluation section and discussed question 4 in the ablation section.</p>
      </sec>
      <sec>
        <title>Model Evaluation</title>
        <p>We trained the system on the 38.99% (8000/20,517) of reference images from the MFDS database via oversampling to represent less frequently appearing colors, shapes, and forms (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). We evaluated the system using 61.01% (12,517/20,517) of images from the MFDS database as reference images and 74.27% (2887/3887) of images from the NLM database. Note that we fine-tuned the system on 25.73% (1000/3887) of reference images from the NLM database before evaluating the NLM images. Furthermore, after fine-tuning the system on 25.73% (1000/3887) of reference images from the NLM database, we evaluated it on 25.73% (1000/3887) of consumer images from the NLM database containing the same pills. In this case, simple image preprocessing such as concatenating pills horizontally was automatically applied to the consumer images. <xref rid="figure4" ref-type="fig">Figure 4</xref> illustrates the training, fine-tuning, and evaluation processes.</p>
        <p>During training, we experimentally established suitable hyperparameters according to each module. We implemented our system and baseline using PyTorch and trained our system on a single machine equipped with an AMD 12-core processor, 128 GB of RAM, and NVIDIA GeForce RTX 3090 with 11 GB of RAM for 2.3 hours. The imprint detection module was trained using a stochastic gradient descent optimizer with a learning rate of 10<sup>-2</sup>, 3 warm-up epochs, and a batch size of 16. The feature recognition module was trained for 100 epochs using a stochastic gradient descent optimizer with a learning rate of 0.1, weight decay of 10<sup>-4</sup>, and batch size of 10 for 1.8 hours. Finally, the imprint correction module was trained for 100 epochs using an Adam optimizer with a learning rate of 10<sup>-3</sup> and batch size of 50 for 0.3 hours, where the embedding size and size of the hidden layer in the language model were 45 and 256, respectively.</p>
        <p>We first evaluated 61.01% (12,517/20,517) of sample pill images from the MFDS database and retrieved the pills from the database by computing the similarity scores between each evaluation image and all pill images in the test data set. The second and third columns in <xref ref-type="table" rid="table2">Table 2</xref> present the evaluation results in terms of the top-1 and top-3 candidate accuracies on the MFDS database. We compared our system’s performance to that of a system using ResNet-50 (baseline) [<xref ref-type="bibr" rid="ref22">22</xref>]. The top-1 accuracy on the MFDS samples was 85.65%, and the top-3 accuracy was 92.35%. It is clearly possible for our system to identify pills that are not used in training because of the role of the retrieval system. By contrast, the baseline system performs classification, rather than retrieval, based on the labels of each pill. Therefore, the baseline systems could not identify pills that were not used in training.</p>
        <p>The results for 74.27% (2887/3887) of NLM reference images show that the top-1 accuracy is 74.46%, and the top-3 accuracy is 88.7%. This indicates that the proposed system can identify pills in 2 different databases with fine-tuning. This demonstrates the structural advantages of our system because it is feasible to apply another database by simply including a small number of sample images without making any structural changes to the system. Furthermore, this result implies that the proposed system does not rely heavily on a particular database.</p>
        <p>Next, we trained the model on only 25.73% (1000/3887) of reference images from the NLM database to evaluate the applicability of the model and compared it with the baseline on 1000 consumer images. We preprocessed the images by horizontally concatenating the front and back sides of the pills. Our system exhibited a top-1 accuracy of 78% and top-3 accuracy of 89.1%, which outperformed the baseline model that resulted in 76.9% top-1 accuracy and 81.1% top-3 accuracy. In addition, we confirmed that our system can identify a pill in real time by taking approximately 0.789 seconds in pill evaluation for consumer images.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Training and test data sets for 3 evaluation experiments are shown. Deep blue bars, green bars, and an orange bar indicate the Ministry of Food and Drug Safety (MFDS) reference images, National Library of Medicine (NLM) reference images, and NLM consumer images, respectively.</p>
          </caption>
          <graphic xlink:href="jmir_v25i1e41043_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Accuracies of the proposed system in terms of the top result on 3 evaluations compared with the baseline [<xref ref-type="bibr" rid="ref22">22</xref>] are shown. Each result indicates the evaluation on the Ministry of Food and Drug Safety (MFDS) reference images, National Library of Medicine (NLM) reference images, and NLM consumer images. Top 1 and top 3 denote the probability that the answer is in the top-1 and top-3 candidates.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="180"/>
            <col width="140"/>
            <col width="140"/>
            <col width="140"/>
            <col width="140"/>
            <col width="130"/>
            <col width="130"/>
            <thead>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">MFDS database</td>
                <td colspan="2">NLM database</td>
                <td colspan="2">NLM consumer</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Top 1 (%)</td>
                <td>Top 3 (%)</td>
                <td>Top 1 (%)</td>
                <td>Top 3 (%)</td>
                <td>Top 1 (%)</td>
                <td>Top 3 (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Baseline system</td>
                <td>—<sup>a</sup></td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
                <td>76.9</td>
                <td>81.1</td>
              </tr>
              <tr valign="top">
                <td>Our system</td>
                <td>85.65</td>
                <td>92.35</td>
                <td>74.46</td>
                <td>88.70</td>
                <td>78</td>
                <td>89.1</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>Not available.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Ablation Study</title>
        <p>We evaluated the influence of the imprint correction module by ablating it from the imprint detection and feature recognition modules. This analysis aimed to examine our hypothesis that the imprint correction module is the most important for improving the accuracy of the system. We determined that the system without an imprint correction module struggled to identify pills in cases where the imprinted character string was long. In such cases, imprints are difficult to identify because of the nonlinear arrangement of imprinted characters or confusion between alphabetical letters and numbers. <xref ref-type="table" rid="table3">Table 3</xref> demonstrates that the accuracy increased by 3.3% to 11.3% when the imprint correction module was added. These results prove our hypothesis that many errors occur in the absence of additional treatment for characters.</p>
        <p>We used another ablation study on the similarity score to demonstrate the importance of imprints in pill images. <xref ref-type="table" rid="table4">Table 4</xref> shows the top-1 accuracy and top-3 accuracy of the proposed system when each characteristic (shape, color, form, and imprint) was missing in the similarity score. When the similarity score of imprints was omitted, the system showed a top-1 accuracy of 0%, indicating that the pill cannot be identified. Even among the top-3 candidates, the probability was only 0.05%.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>The accuracies for the Ministry of Food and Drug Safety (MFDS) data set (left) and the National Library of Medicine (NLM) data set (right) are shown. The case without the imprint correction module is in the upper row (without language model [LM]), and the case with it is in the lower row (with LM).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">MFDS database</td>
                <td colspan="2">NLM database</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Top 1 (%)</td>
                <td>Top 3 (%)</td>
                <td>Top 1 (%)</td>
                <td>Top 3 (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Without LM</td>
                <td>74.4</td>
                <td>83.9</td>
                <td>71.46</td>
                <td>84.68</td>
              </tr>
              <tr valign="top">
                <td>With LM</td>
                <td>85.7</td>
                <td>92.4</td>
                <td>74.76</td>
                <td>88.7</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>The accuracies by ablating each characteristic (shape, color, form, and imprint) of pills in the similarity score for the Ministry of Food and Drug Safety data set are shown. “None” denotes our proposed system considering all characteristics in the similarity score.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="500"/>
            <col width="250"/>
            <col width="250"/>
            <thead>
              <tr valign="top">
                <td>Ablated characteristic</td>
                <td>Top 1 (%)</td>
                <td>Top 3 (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>None</td>
                <td>85.65</td>
                <td>92.35</td>
              </tr>
              <tr valign="top">
                <td>Shape</td>
                <td>83.88</td>
                <td>91.59</td>
              </tr>
              <tr valign="top">
                <td>Color</td>
                <td>81.08</td>
                <td>89.74</td>
              </tr>
              <tr valign="top">
                <td>Form</td>
                <td>85.94</td>
                <td>92.29</td>
              </tr>
              <tr valign="top">
                <td>Imprint</td>
                <td>0</td>
                <td>0.05</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>This work demonstrates that incorporating language models into a deep learning–based pill identification system significantly improves the accuracy of the system and helps the system generalize to other data.</p>
        <p>We verified the generalization and robustness of our system by using different pill types for evaluation from those used in training. The experimental results demonstrated that our system could identify pills that were not used in training. Therefore, our system can identify newly enrolled pills without additional training. In addition, we evaluated 74.27% (2887/3887) of pill samples from the NLM database of the United States and 61.01% (12,517/20,517) of pill samples from the MFDS database of South Korea. The experimental results demonstrate that our model does not rely on a particular database. Furthermore, the proposed system can reduce database dependence and achieve high performance on 2 different databases because it learns the characteristics of pills by segmenting them into 3 features (color, shape, and forms) and imprinted characters and measures the similarities of each predicted characteristic to characteristics of target pills. We conducted experiments on identifying consumer images while training our system on reference images for comparison with the baseline and a discussion of the system’s applicability. Because the consumer images in the NLM database have various lighting conditions, the pill identification task was more challenging. In this evaluation, our model outperformed the baseline, as shown in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
        <p>The ablation study section determined that the imprint correction module can significantly improve identification performance. <xref ref-type="table" rid="table3">Table 3</xref> presents the results with and without the imprint correction module based on the characters and their coordinates extracted by the imprint detection module. An RNN, which exhibits good performance for deep learning–based machine translation, is used for correcting imprinted characters as a character-level language model. In the absence of this module, the system uses the initial results from imprint detection to calculate similarity scores. The results in <xref ref-type="table" rid="table3">Table 3</xref> prove our hypothesis that pills can be identified more accurately by implementing an imprint correction module in the form of a language model. Pills are difficult to identify based solely on their shape, color, and form. Rather, it is important for pill identification to accurately detect imprinted characters because imprints contain information regarding pill ingredients, their amounts, and the pill manufacturer. To the best of our knowledge, ours is the first attempt to extract each character and its coordinates separately and then correct them using a language model in the pill identification system. In addition, rather than simply predicting and classifying the labels of the pills, the similarity scores with the pills in the database were calculated by considering different characteristics and assigning larger weight values to the imprints. Moreover, in another ablation study, as shown in <xref ref-type="table" rid="table4">Table 4</xref>, imprint was shown to have a crucial role in identifying pills. By contrast, form makes little contribution to the identification of pills.</p>
      </sec>
      <sec>
        <title>Comparison With Prior Work</title>
        <p><xref ref-type="table" rid="table2">Table 2</xref> presents the results of a state-of-the-art system [<xref ref-type="bibr" rid="ref22">22</xref>] using ResNet-50 and a classification system and compares them with those of our system obtained in the same environment. The former systems [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref25">25</xref>] identify pills by extracting a single feature from the image of a pill and then performing classification based on class labels via a softmax function. By contrast, our system distinguishes the features of the pill by color, shape, form, and imprint and then retrieves the matching pills from a database. Our system detects characters and their coordinates and then corrects the result more precisely, unlike previous studies that did not consider imprints carefully. We used ResNet, which is also used in the baseline, but considered fewer layers than the baseline (32 vs 50). We added separate modules designed to extract features and imprints to establish a much more specific and accurate identification process. We found that the system proposed by Zeng et al [<xref ref-type="bibr" rid="ref21">21</xref>] used the same data set as that used by Larios Delgado et al [<xref ref-type="bibr" rid="ref22">22</xref>]. However, we chose the system from Larios Delgado et al [<xref ref-type="bibr" rid="ref22">22</xref>] as the baseline because its performance was superior. We used 2 different databases to demonstrate the effectiveness of the proposed system. We trained the model with 38.99% (8000/20,517) of pill samples from the database provided by the MFDS in South Korea for our first 2 experiments. The evaluation was conducted on 61.01% (12,517/20,517) of MFDS samples that were completely different from those used for training. 
There is a system that achieved excellent pill identification results, but only 500 pills were used for both training and evaluation [<xref ref-type="bibr" rid="ref13">13</xref>]. The biggest difference between our system and previous systems is that our system can identify pills that are not used in training, whereas previous systems could not identify new pills. This is because we constructed our system as a retrieval system, rather than a classification system.</p>
      </sec>
      <sec>
        <title>Strengths and Limitations</title>
        <p>We introduce 2 strengths of the proposed system. First, the proposed system can identify new types of pills that have not been used in learning. Even if the new drug is approved and the database is updated, further learning of the system is not required. Second, the proposed system has a high generalization capability for the database. We pretrained the system with data from the MFDS and evaluated it by fine-tuning it with much less NLM data to confirm that the system shows promising results.</p>
        <p>However, our system has 3 limitations. First, we used only capital letters except for “mg” (milligram) as the imprinted letters of the pill. Pills with lowercase letters were difficult to learn because of the small amount of data and, consequently, were not used in training. Symbols were not used in training for the same reason. We will compensate for these limitations by collecting more data in future studies. Second, we observed that the experimental results for the NLM data demonstrated slightly lower performance than the results for the MFDS data. This is largely attributed to the fact that the system is pretrained on large amounts of the MFDS data and fine-tuned on small amounts of NLM data. To maintain the knowledge from large data and exhibit stable performance in various databases, we will later attempt to introduce transfer learning techniques via multitask learning [<xref ref-type="bibr" rid="ref45">45</xref>] or adapter [<xref ref-type="bibr" rid="ref46">46</xref>]. Third, when extracting pills from the images, we extracted them separately from the background using an algorithm without applying a deep learning–based segmentation model. From the image of the database we used, the pill shape was extracted with high accuracy just by extracting it with an algorithm. However, because various backgrounds, shading, and some broken pills may exist in real-world images, the latest model based on deep learning is expected to be helpful. In the future, it is expected that the system can be improved by using photos of actual drugs and applying deep learning–based segmentation through collaboration with research teams in the pharmaceutical field.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In this study, we hypothesized that the most important information in a pill image lies in imprinted characters and proposed a pill identification system utilizing imprinted characters. Unlike most pill identification systems that use only models specialized for image classification, we implemented a character-level language model to achieve high-accuracy pill recognition. To evaluate the robustness and generalizability of the proposed system, it was evaluated using 2 different pill databases in South Korea and the United States. Unlike the baseline systems, our system can identify pills that are not contained in the training data set. In addition, we conducted pill identification experiments on consumer images of pills that were only seen once during training to compare our system with the baseline. The baseline, which utilizes CNNs, has achieved outstanding performance among the existing systems. However, our system achieved a higher accuracy than the baseline. Moreover, our system shows high accuracy while taking less than 1 second to identify the pill, so it is expected to be utilized in devices such as previously proposed wearable devices [<xref ref-type="bibr" rid="ref23">23</xref>] or mobile apps [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. Therefore, it is hoped that the study on AI-based pill identification systems can be advanced based on the foundation laid by our study.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Distribution in training data set of the Ministry of Food and Drug Safety.</p>
        <media xlink:href="jmir_v25i1e41043_app1.png" xlink:title="PNG File , 702 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">CNN</term>
          <def>
            <p>convolutional neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">MFDS</term>
          <def>
            <p>Ministry of Food and Drug Safety</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">NLM</term>
          <def>
            <p>National Library of Medicine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">OECD</term>
          <def>
            <p>Organisation for Economic Co-operation and Development</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">ResNet</term>
          <def>
            <p>Residual Network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">RNN</term>
          <def>
            <p>recurrent neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">YOLO</term>
          <def>
            <p>You Only Look Once</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was supported by the National Research Foundation of Korea grant funded by the Ministry of Science and ICT (NRF-2021R1G1A1094236 and NRF-2022R1C1C1010317).</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The data sets generated during and analyzed during this study are available from the corresponding author upon reasonable request. The code is available from the corresponding author upon reasonable request.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>JH, DHJ, and KMK conceptualized the study. JH collected the databases, refined the data, and developed the system. YK wrote the original manuscript. JH and YK analyzed and visualized the data. KMK, DHJ, and SL supervised this study and edited the manuscript. KMK and DHJ acquired the project funding. All the authors reviewed and approved the final version of the manuscript. JH and YK are co-first authors who equally contributed to this work. DHJ and KMK are co-corresponding authors who equally contributed to this work.
</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Slawomirski</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Auraaen</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Klazinga</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>The economics of patient safety</article-title>
          <source>Organisation for Economic Co-operation and Development (OECD)</source>
          <year>2017</year>
          <month>3</month>
          <access-date>2022-12-19</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.oecd.org/health/health-systems/The-economics-of-patient-safety-March-2017.pdf">https://www.oecd.org/health/health-systems/The-economics-of-patient-safety-March-2017.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <collab>Institute of Medicine</collab>
          </person-group>
          <source>Preventing Medication Errors</source>
          <year>2007</year>
          <publisher-loc>Washington, DC</publisher-loc>
          <publisher-name>The National Academies Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="web">
          <article-title>Reducing and preventing adverse drug events to decrease hospital costs</article-title>
          <source>PS NET</source>
          <year>2005</year>
          <month>3</month>
          <day>6</day>
          <access-date>2022-12-19</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://psnet.ahrq.gov/issue/reducing-and-preventing-adverse-drug-events-decrease-hospital-costs">https://psnet.ahrq.gov/issue/reducing-and-preventing-adverse-drug-events-decrease-hospital-costs</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weiss</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Freeman</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Heslin</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Barrett</surname>
              <given-names>ML</given-names>
            </name>
          </person-group>
          <article-title>Adverse drug events in U.S. Hospitals, 2010 versus 2014</article-title>
          <source>Agency for Healthcare Research and Quality</source>
          <year>2018</year>
          <access-date>2022-12-19</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.hcup-us.ahrq.gov/reports/statbriefs/sb234-Adverse-Drug-Events.pdf">https://www.hcup-us.ahrq.gov/reports/statbriefs/sb234-Adverse-Drug-Events.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>TI</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>DW</given-names>
            </name>
            <name name-style="western">
              <surname>Jeon</surname>
              <given-names>EK</given-names>
            </name>
            <name name-style="western">
              <surname>Rhee</surname>
              <given-names>CM</given-names>
            </name>
            <name name-style="western">
              <surname>Kalantar-Zadeh</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>EW</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>SH</given-names>
            </name>
          </person-group>
          <article-title>Polypharmacy, hospitalization, and mortality risk: a nationwide cohort study</article-title>
          <source>Sci Rep</source>
          <year>2020</year>
          <month>11</month>
          <day>03</day>
          <volume>10</volume>
          <issue>1</issue>
          <fpage>18964</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-020-75888-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-020-75888-8</pub-id>
          <pub-id pub-id-type="medline">33144598</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-020-75888-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC7609640</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Melnyk</surname>
              <given-names>PS</given-names>
            </name>
            <name name-style="western">
              <surname>Shevchuk</surname>
              <given-names>YM</given-names>
            </name>
            <name name-style="western">
              <surname>Remillard</surname>
              <given-names>AJ</given-names>
            </name>
          </person-group>
          <article-title>Impact of the dial access drug information service on patient outcome</article-title>
          <source>Ann Pharmacother</source>
          <year>2000</year>
          <month>05</month>
          <volume>34</volume>
          <issue>5</issue>
          <fpage>585</fpage>
          <lpage>92</lpage>
          <pub-id pub-id-type="doi">10.1345/aph.19173</pub-id>
          <pub-id pub-id-type="medline">10852084</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>LeBlanc</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Seoane-Vazquez</surname>
              <given-names>EC</given-names>
            </name>
            <name name-style="western">
              <surname>Arbo</surname>
              <given-names>TC</given-names>
            </name>
            <name name-style="western">
              <surname>Dasta</surname>
              <given-names>JF</given-names>
            </name>
          </person-group>
          <article-title>International critical care hospital pharmacist activities</article-title>
          <source>Intensive Care Med</source>
          <year>2008</year>
          <month>03</month>
          <volume>34</volume>
          <issue>3</issue>
          <fpage>538</fpage>
          <lpage>42</lpage>
          <pub-id pub-id-type="doi">10.1007/s00134-007-0918-2</pub-id>
          <pub-id pub-id-type="medline">17987280</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Costerison</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Graham</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Developing and promoting an intranet site for a drug information service</article-title>
          <source>Am J Health Syst Pharm</source>
          <year>2008</year>
          <month>04</month>
          <day>01</day>
          <volume>65</volume>
          <issue>7</issue>
          <fpage>639</fpage>
          <lpage>43</lpage>
          <pub-id pub-id-type="doi">10.2146/ajhp070318</pub-id>
          <pub-id pub-id-type="medline">18359972</pub-id>
          <pub-id pub-id-type="pii">65/7/639</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shanmugam</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yong</surname>
              <given-names>CS</given-names>
            </name>
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>HG</given-names>
            </name>
            <name name-style="western">
              <surname>Yoo</surname>
              <given-names>BK</given-names>
            </name>
          </person-group>
          <article-title>Analysis on drug identification service and other drug-related queries in a hospital pharmacy</article-title>
          <source>YAKHAK HOEJI</source>
          <year>2008</year>
          <volume>52</volume>
          <issue>4</issue>
          <fpage>283</fpage>
          <lpage>7</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tefera</surname>
              <given-names>YG</given-names>
            </name>
            <name name-style="western">
              <surname>Gebresillassie</surname>
              <given-names>BM</given-names>
            </name>
            <name name-style="western">
              <surname>Ayele</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>Belay</surname>
              <given-names>YB</given-names>
            </name>
            <name name-style="western">
              <surname>Emiru</surname>
              <given-names>YK</given-names>
            </name>
          </person-group>
          <article-title>The characteristics of drug information inquiries in an Ethiopian university hospital: a two-year observational study</article-title>
          <source>Sci Rep</source>
          <year>2019</year>
          <month>09</month>
          <day>25</day>
          <volume>9</volume>
          <issue>1</issue>
          <fpage>13835</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-019-50204-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-019-50204-1</pub-id>
          <pub-id pub-id-type="medline">31554837</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-019-50204-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC6761201</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Almazrou</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Ali</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Alzhrani</surname>
              <given-names>JA</given-names>
            </name>
          </person-group>
          <article-title>Assessment of queries received by the drug information center at King Saud Medical City</article-title>
          <source>J Pharm Bioallied Sci</source>
          <year>2017</year>
          <volume>9</volume>
          <issue>4</issue>
          <fpage>246</fpage>
          <lpage>50</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jpbsonline.org/article.asp?issn=0975-7406;year=2017;volume=9;issue=4;spage=246;epage=250;aulast=Almazrou"/>
          </comment>
          <pub-id pub-id-type="doi">10.4103/jpbs.JPBS_166_17</pub-id>
          <pub-id pub-id-type="medline">29456375</pub-id>
          <pub-id pub-id-type="pii">JPBS-9-246</pub-id>
          <pub-id pub-id-type="pmcid">PMC5810074</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Jain</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Pill-ID: matching and retrieval of drug pill images</article-title>
          <source>Pattern Recognit Letters</source>
          <year>2012</year>
          <month>05</month>
          <volume>33</volume>
          <issue>7</issue>
          <fpage>904</fpage>
          <lpage>10</lpage>
          <pub-id pub-id-type="doi">10.1016/j.patrec.2011.08.022</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Kamata</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>A new accurate pill recognition system using imprint information</article-title>
          <source>Proceedings of the Sixth International Conference on Machine Vision (ICMV 2013)</source>
          <year>2013</year>
          <conf-name>Sixth International Conference on Machine Vision (ICMV 2013)</conf-name>
          <conf-date>Dec 24, 2013</conf-date>
          <conf-loc>London, United Kingdom</conf-loc>
          <pub-id pub-id-type="doi">10.1117/12.2051168</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Kamata</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Pill recognition using imprint information by two-step sampling distance sets</article-title>
          <source>Proceedings of the  22nd International Conference on Pattern Recognition</source>
          <year>2014</year>
          <conf-name>22nd International Conference on Pattern Recognition</conf-name>
          <conf-date>Aug 24-28, 2014</conf-date>
          <conf-loc>Stockholm, Sweden</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icpr.2014.544</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Kamata</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Accurate system for automatic pill recognition using imprint information</article-title>
          <source>IET Image Processing</source>
          <year>2015</year>
          <month>12</month>
          <volume>9</volume>
          <issue>12</issue>
          <fpage>1039</fpage>
          <lpage>47</lpage>
          <pub-id pub-id-type="doi">10.1049/iet-ipr.2014.1007</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>YF</given-names>
            </name>
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>HT</given-names>
            </name>
            <name name-style="western">
              <surname>Leung</surname>
              <given-names>KY</given-names>
            </name>
            <name name-style="western">
              <surname>Chan</surname>
              <given-names>KY</given-names>
            </name>
            <name name-style="western">
              <surname>Chan</surname>
              <given-names>SY</given-names>
            </name>
            <name name-style="western">
              <surname>Loy</surname>
              <given-names>CC</given-names>
            </name>
          </person-group>
          <article-title>Development of fine-grained pill identification algorithm using deep convolutional network</article-title>
          <source>J Biomed Inform</source>
          <year>2017</year>
          <month>10</month>
          <volume>74</volume>
          <fpage>130</fpage>
          <lpage>6</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(17)30202-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2017.09.005</pub-id>
          <pub-id pub-id-type="medline">28923366</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(17)30202-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>James</surname>
              <given-names>KL</given-names>
            </name>
            <name name-style="western">
              <surname>Barlow</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>McArtney</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Hiom</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Roberts</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Whittlesea</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Incidence, type and causes of dispensing errors: a review of the literature</article-title>
          <source>Int J Pharm Pract</source>
          <year>2009</year>
          <month>02</month>
          <volume>17</volume>
          <issue>1</issue>
          <fpage>9</fpage>
          <lpage>30</lpage>
          <pub-id pub-id-type="medline">20218026</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Flynn</surname>
              <given-names>EA</given-names>
            </name>
            <name name-style="western">
              <surname>Barker</surname>
              <given-names>KN</given-names>
            </name>
          </person-group>
          <article-title>Effect of an automated dispensing system on errors in two pharmacies</article-title>
          <source>J Am Pharm Assoc (2003)</source>
          <year>2006</year>
          <volume>46</volume>
          <issue>5</issue>
          <fpage>613</fpage>
          <lpage>5</lpage>
          <pub-id pub-id-type="doi">10.1331/1544-3191.46.5.613.flynn</pub-id>
          <pub-id pub-id-type="medline">17036648</pub-id>
          <pub-id pub-id-type="pii">S1544-3191(15)31480-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ryu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Sung</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Usability evaluation of an image-based pill identification application</article-title>
          <source>J Rheum Dis</source>
          <year>2019</year>
          <volume>26</volume>
          <issue>2</issue>
          <fpage>111</fpage>
          <pub-id pub-id-type="doi">10.4078/jrd.2019.26.2.111</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yaniv</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Faruque</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Howe</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dunn</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Sharlip</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bond</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Perillan</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bodenreider</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Ackerman</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Yoo</surname>
              <given-names>TS</given-names>
            </name>
          </person-group>
          <article-title>The National Library of Medicine pill image recognition challenge: an initial report</article-title>
          <source>IEEE Appl Imag Pattern Recognit Workshop</source>
          <year>2016</year>
          <month>10</month>
          <volume>2016</volume>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/29854569"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/AIPR.2016.8010584</pub-id>
          <pub-id pub-id-type="medline">29854569</pub-id>
          <pub-id pub-id-type="pmcid">PMC5973812</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zeng</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>MobileDeepPill: a small-footprint mobile deep learning system for recognizing unconstrained pill images</article-title>
          <source>Proceedings of the 15th Annual International Conference on Mobile Systems, Applications, and Services</source>
          <year>2017</year>
          <conf-name>MobiSys'17: The 15th Annual International Conference on Mobile Systems, Applications, and Services</conf-name>
          <conf-date>Jun 19 - 23, 2017</conf-date>
          <conf-loc>Niagara Falls New York USA</conf-loc>
          <pub-id pub-id-type="doi">10.1145/3081333.3081336</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Larios Delgado</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Usuyama</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Hall</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Hazen</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sahu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lundin</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Fast and accurate medication identification</article-title>
          <source>NPJ Digit Med</source>
          <year>2019</year>
          <month>2</month>
          <day>28</day>
          <volume>2</volume>
          <issue>1</issue>
          <fpage>10</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-019-0086-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-019-0086-0</pub-id>
          <pub-id pub-id-type="medline">31304359</pub-id>
          <pub-id pub-id-type="pii">86</pub-id>
          <pub-id pub-id-type="pmcid">PMC6550183</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>MedGlasses: a wearable smart-glasses-based drug pill recognition system using deep learning for visually impaired chronic patients</article-title>
          <source>IEEE Access</source>
          <year>2020</year>
          <volume>8</volume>
          <fpage>17013</fpage>
          <lpage>24</lpage>
          <pub-id pub-id-type="doi">10.1109/access.2020.2967400</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Szegedy</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Jia</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Sermanet</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Reed</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Anguelov</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Erhan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Vanhoucke</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Rabinovich</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Going deeper with convolutions</article-title>
          <source>Proceedings of the 2015 IEEE Conference on Computer Vision and Pattern Recognition</source>
          <year>2015</year>
          <conf-name>CVPR '15</conf-name>
          <conf-date>June 7-12, 2015</conf-date>
          <conf-loc>Boston, MA, USA</conf-loc>
          <fpage>1</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1109/CVPR.2015.7298594</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ribera</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Yarlagadda</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Pill recognition using minimal labeled data</article-title>
          <source>Proceedings of the IEEE 3rd International Conference on Multimedia Big Data</source>
          <year>2017</year>
          <conf-name>BigMM '17</conf-name>
          <conf-date>April 19-21, 2017</conf-date>
          <conf-loc>Laguna Hills, CA, USA</conf-loc>
          <fpage>346</fpage>
          <lpage>53</lpage>
          <pub-id pub-id-type="doi">10.1109/BigMM.2017.61</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Krizhevsky</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Hinton</surname>
              <given-names>GE</given-names>
            </name>
          </person-group>
          <article-title>ImageNet classification with deep convolutional neural networks</article-title>
          <source>Commun ACM</source>
          <year>2017</year>
          <month>05</month>
          <day>24</day>
          <volume>60</volume>
          <issue>6</issue>
          <fpage>84</fpage>
          <lpage>90</lpage>
          <pub-id pub-id-type="doi">10.1145/3065386</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lester</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ding</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Rowell</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kontar</surname>
              <given-names>RA</given-names>
            </name>
          </person-group>
          <article-title>Performance evaluation of a prescription medication image classification model: an observational cohort</article-title>
          <source>NPJ Digit Med</source>
          <year>2021</year>
          <month>07</month>
          <day>27</day>
          <volume>4</volume>
          <issue>1</issue>
          <fpage>118</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-021-00483-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-021-00483-8</pub-id>
          <pub-id pub-id-type="medline">34315995</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-021-00483-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC8316316</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Redmon</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Divvala</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Girshick</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Farhadi</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>You only look once: unified, real-time object detection</article-title>
          <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</source>
          <year>2016</year>
          <conf-name>IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>
          <conf-date>Jun 27-30, 2016</conf-date>
          <conf-loc>Las Vegas, NV, USA</conf-loc>
          <pub-id pub-id-type="doi">10.1109/cvpr.2016.91</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="web">
          <article-title>ultralytics/yolov5: v3.1 - Bug Fixes and Performance Improvements</article-title>
          <source>Zenodo</source>
          <year>2020</year>
          <month>10</month>
          <day>29</day>
          <access-date>2022-12-19</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://zenodo.org/record/4154370#.Y4ms7XZBzIU">https://zenodo.org/record/4154370#.Y4ms7XZBzIU</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>He</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Ren</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Deep residual learning for image recognition</article-title>
          <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</source>
          <year>2016</year>
          <conf-name>IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>
          <conf-date>Jun 27-30, 2016</conf-date>
          <conf-loc>Las Vegas, NV, USA</conf-loc>
          <pub-id pub-id-type="doi">10.1109/cvpr.2016.90</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rumelhart</surname>
              <given-names>DE</given-names>
            </name>
            <name name-style="western">
              <surname>Hinton</surname>
              <given-names>GE</given-names>
            </name>
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>RJ</given-names>
            </name>
          </person-group>
          <article-title>Learning representations by back-propagating errors</article-title>
          <source>Nature</source>
          <year>1986</year>
          <month>10</month>
          <day>9</day>
          <volume>323</volume>
          <issue>6088</issue>
          <fpage>533</fpage>
          <lpage>6</lpage>
          <pub-id pub-id-type="doi">10.1038/323533a0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Vinyals</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Le</surname>
              <given-names>Q</given-names>
            </name>
          </person-group>
          <article-title>Sequence to sequence learning with neural networks</article-title>
          <source>arXiv</source>
          <year>2014</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1409.3215"/>
          </comment>
          <pub-id pub-id-type="doi">10.5555/2969033.2969173</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>van Merriënboer</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Gulcehre</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Bahdanau</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bougares</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Schwenk</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Learning phrase representations using RNN encoder–decoder for statistical machine translation</article-title>
          <source>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)</source>
          <year>2014</year>
          <conf-name>2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)</conf-name>
          <conf-date>Oct, 2014</conf-date>
          <conf-loc>Doha, Qatar</conf-loc>
          <pub-id pub-id-type="doi">10.3115/v1/d14-1179</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bahdanau</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Neural machine translation by jointly learning to align and translate</article-title>
          <source>Proceedings of the 3rd International Conference on Learning Representations (ICLR)</source>
          <year>2015</year>
          <conf-name>3rd International Conference on Learning Representations (ICLR)</conf-name>
          <conf-date>May 7-9, 2015</conf-date>
          <conf-loc>San Diego, CA, USA</conf-loc>
          <pub-id pub-id-type="doi">10.48550/arXiv.1409.0473</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Luong</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Pham</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Effective approaches to attention-based neural machine translation</article-title>
          <source>Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2015</year>
          <conf-name>2015 Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>Sep 17-21, 2015</conf-date>
          <conf-loc>Lisbon, Portugal</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/d15-1166</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Case</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Suresh</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Coates</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Autonomous sign reading for semantic mapping</article-title>
          <source>Proceedings of the IEEE International Conference on Robotics and Automation</source>
          <year>2011</year>
          <conf-name>IEEE International Conference on Robotics and Automation</conf-name>
          <conf-date>May 09-13, 2011</conf-date>
          <conf-loc>Shanghai, China</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icra.2011.5980523</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ham</surname>
              <given-names>YK</given-names>
            </name>
          </person-group>
          <article-title>Recognition of raised characters for automatic classification of rubber tires</article-title>
          <source>Opt Eng</source>
          <year>1995</year>
          <month>01</month>
          <day>01</day>
          <volume>34</volume>
          <issue>1</issue>
          <fpage>102</fpage>
          <pub-id pub-id-type="doi">10.1117/12.184094</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chandrasekhar</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Tsai</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Cheung</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Takacs</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Reznik</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Vedantham</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Grzeszczuk</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bach</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Girod</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>The Stanford mobile visual search data set</article-title>
          <source>Proceedings of the second annual ACM conference on Multimedia systems</source>
          <year>2011</year>
          <conf-name>MMSYS '11: MMSYS '11 - Multimedia Systems Conference</conf-name>
          <conf-date>Feb 23 - 25, 2011</conf-date>
          <conf-loc>San Jose CA USA</conf-loc>
          <pub-id pub-id-type="doi">10.1145/1943552.1943568</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Deng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dong</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Socher</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Fei-Fei</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>ImageNet: a large-scale hierarchical image database</article-title>
          <source>Proceedings of the 2009 IEEE Conference on Computer Vision and Pattern Recognition</source>
          <year>2009</year>
          <conf-name>2009 IEEE Conference on Computer Vision and Pattern Recognition</conf-name>
          <conf-date>Jun 20-25, 2009</conf-date>
          <conf-loc>Miami, FL, USA</conf-loc>
          <pub-id pub-id-type="doi">10.1109/cvpr.2009.5206848</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Russakovsky</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Deng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Su</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Krause</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Satheesh</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Karpathy</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Khosla</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bernstein</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Berg</surname>
              <given-names>AC</given-names>
            </name>
            <name name-style="western">
              <surname>Fei-Fei</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>ImageNet large scale visual recognition challenge</article-title>
          <source>Int J Comput Vis</source>
          <year>2015</year>
          <month>4</month>
          <day>11</day>
          <volume>115</volume>
          <issue>3</issue>
          <fpage>211</fpage>
          <lpage>52</lpage>
          <pub-id pub-id-type="doi">10.1007/s11263-015-0816-y</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Maire</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Belongie</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hays</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Perona</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Ramanan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Zitnick</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Dollár</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Microsoft COCO: common objects in context</article-title>
          <source>arXiv</source>
          <year>2014</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1405.0312"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/978-3-319-10602-1_48</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>van Merriënboer</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Bahdanau</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>On the properties of neural machine translation: encoder–decoder approaches</article-title>
          <source>Proceedings of SSST-8, Eighth Workshop on Syntax, Semantics and Structure in Statistical Translation</source>
          <year>2014</year>
          <conf-name>SSST-8, Eighth Workshop on Syntax, Semantics and Structure in Statistical Translation</conf-name>
          <conf-date>Oct 25, 2014</conf-date>
          <conf-loc>Doha, Qatar</conf-loc>
          <pub-id pub-id-type="doi">10.3115/v1/w14-4012</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Levenshtein</surname>
              <given-names>VI</given-names>
            </name>
          </person-group>
          <article-title>Binary codes capable of correcting deletions, insertions, and reversals</article-title>
          <source>Soviet Physics Doklady</source>
          <year>1966</year>
          <month>2</month>
          <volume>10</volume>
          <issue>8</issue>
          <fpage>707</fpage>
          <lpage>10</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Navarro</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>A guided tour to approximate string matching</article-title>
          <source>ACM Comput Surv</source>
          <year>2001</year>
          <month>3</month>
          <volume>33</volume>
          <issue>1</issue>
          <fpage>31</fpage>
          <lpage>88</lpage>
          <pub-id pub-id-type="doi">10.1145/375360.375365</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Caruana</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Multitask learning</article-title>
          <source>Learning to Learn</source>
          <year>1998</year>
          <publisher-loc>Boston, MA</publisher-loc>
          <publisher-name>Springer</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dai</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Xiao</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yuan</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Dynamic head: unifying object detection heads with attentions</article-title>
          <source>Proceedings of the 2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source>
          <year>2021</year>
          <conf-name>2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>
          <conf-date>Jun 20-25, 2021</conf-date>
          <conf-loc>Nashville, TN, USA</conf-loc>
          <pub-id pub-id-type="doi">10.1109/cvpr46437.2021.00729</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
