<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<?covid-19-tdm?>
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v24i7e38584</article-id>
      <article-id pub-id-type="pmid">35658098</article-id>
      <article-id pub-id-type="doi">10.2196/38584</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Deep Denoising of Raw Biomedical Knowledge Graph From COVID-19 Literature, LitCovid, and Pubtator: Framework Development and Validation</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Basch</surname>
            <given-names>Corey</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Li</surname>
            <given-names>Xin</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Pavliuk</surname>
            <given-names>Olena</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Jiang</surname>
            <given-names>Chao</given-names>
          </name>
          <degrees>BS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-0467-6177</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Ngo</surname>
            <given-names>Victoria</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9973-8379</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Chapman</surname>
            <given-names>Richard</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3600-0286</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Yu</surname>
            <given-names>Yue</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3900-1217</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>Hongfang</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2570-3741</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Jiang</surname>
            <given-names>Guoqian</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2940-0019</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Zong</surname>
            <given-names>Nansu</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <address>
            <institution>Department of Artificial Intelligence and Informatics Research</institution>
            <institution>Mayo Clinic</institution>
            <addr-line>200 First St SW</addr-line>
            <addr-line>Rochester, MN, 55905</addr-line>
            <country>United States</country>
            <phone>1 507 284 2511</phone>
            <email>Zong.Nansu@mayo.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0066-9524</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Computer Science and Software Engineering</institution>
        <institution>Auburn University</institution>
        <addr-line>Auburn, AL</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Center for Innovation to Implementation</institution>
        <institution>VA Palo Alto Health Care System</institution>
        <addr-line>Sacramento, CA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Stanford Health Policy</institution>
        <institution>Stanford School of Medicine</institution>
        <institution>Stanford University</institution>
        <addr-line>Stanford, CA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Freeman Spogli Institute for International Studies</institution>
        <institution>Stanford University</institution>
        <addr-line>Stanford, CA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Department of Artificial Intelligence and Informatics Research</institution>
        <institution>Mayo Clinic</institution>
        <addr-line>Rochester, MN</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Nansu Zong <email>Zong.Nansu@mayo.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>7</month>
        <year>2022</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>6</day>
        <month>7</month>
        <year>2022</year>
      </pub-date>
      <volume>24</volume>
      <issue>7</issue>
      <elocation-id>e38584</elocation-id>
      <history>
        <date date-type="received">
          <day>7</day>
          <month>4</month>
          <year>2022</year>
        </date>
        <date date-type="rev-request">
          <day>1</day>
          <month>5</month>
          <year>2022</year>
        </date>
        <date date-type="rev-recd">
          <day>20</day>
          <month>5</month>
          <year>2022</year>
        </date>
        <date date-type="accepted">
          <day>30</day>
          <month>5</month>
          <year>2022</year>
        </date>
      </history>
      <copyright-statement>©Chao Jiang, Victoria Ngo, Richard Chapman, Yue Yu, Hongfang Liu, Guoqian Jiang, Nansu Zong. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 06.07.2022.</copyright-statement>
      <copyright-year>2022</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2022/7/e38584" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Multiple types of biomedical associations of knowledge graphs, including COVID-19–related ones, are constructed based on co-occurring biomedical entities retrieved from recent literature. However, the applications derived from these raw graphs (eg, association predictions among genes, drugs, and diseases) have a high probability of false-positive predictions as co-occurrences in the literature do not always mean there is a true biomedical association between two entities.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>Data quality plays an important role in training deep neural network models; however, most of the current work in this area has been focused on improving a model’s performance with the assumption that the preprocessed data are clean. Here, we studied how to remove noise from raw knowledge graphs with limited labeled information.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>The proposed framework used generative-based deep neural networks to generate a graph that can distinguish the unknown associations in the raw training graph. Two generative adversarial network models, NetGAN and Cross-Entropy Low-rank Logits (CELL), were adopted for the edge classification (ie, link prediction), leveraging unlabeled link information based on a real knowledge graph built from LitCovid and Pubtator.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The performance of link prediction, especially in the extreme case of training data versus test data at a ratio of 1:9, demonstrated that the proposed method still achieved favorable results (area under the receiver operating characteristic curve &#62;0.8 for the synthetic data set and 0.7 for the real data set), despite the limited amount of training data available.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Our preliminary findings showed the proposed framework achieved promising results for removing noise during data preprocessing of the biomedical knowledge graph, potentially improving the performance of downstream applications by providing cleaner data.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>adversarial generative network</kwd>
        <kwd>knowledge graph</kwd>
        <kwd>deep denoising</kwd>
        <kwd>machine learning</kwd>
        <kwd>COVID-19</kwd>
        <kwd>biomedical</kwd>
        <kwd>neural network</kwd>
        <kwd>network model</kwd>
        <kwd>training data</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>The effects of the COVID-19 pandemic linger in 2022—it affected over 11.6 million people globally in the past year, and accounted for &#62;2.5 million deaths in more than 220 countries [<xref ref-type="bibr" rid="ref1">1</xref>]. With the continuous accumulation of peer-reviewed publications on the topic, a literature hub serves as a means to track the most up-to-date scientific information about the virus [<xref ref-type="bibr" rid="ref2">2</xref>]—encompassing research on the treatment, diagnosis, and prevention of COVID-19. A knowledge base built upon the integration of biomedical entities from such a literature hub would provide tremendous value in the exploration of explicit or implicit associations among diverse biomedical entities as investigators attempt to answer clinical questions related to COVID-19. A number of recently published journal articles have included graph-based analysis of COVID-19 data sets [<xref ref-type="bibr" rid="ref3">3</xref>]. For example, Groza [<xref ref-type="bibr" rid="ref4">4</xref>] analyzed how a semantically annotated data set would be helpful in detecting and preventing potentially harmful misinformation regarding the spread of COVID-19 based on CORD-19-on-FHIR (a linked data version of the COVID-19 Open Research Dataset [CORD-19] data represented in FHIR RDF by mining the CORD-19 data set and adding semantic annotations) [<xref ref-type="bibr" rid="ref5">5</xref>].</p>
      <p>Most knowledge graphs constructed for COVID-19 are currently based on the co-occurring biomedical entities reported in recent literature. A knowledge graph of co-occurring concepts, such as the one created by Oniani et al [<xref ref-type="bibr" rid="ref6">6</xref>], can help researchers find associations among genes, drugs, and diseases related to COVID-19. Using knowledge graphs with heterogeneous biomedical associations (eg, gene-drug, disease-drug, drug–side effect) in these types of applications, however, results in a high probability of false-positive predictions because co-occurrence in literature does not always mean there is a true biomedical association between the two entities. These co-occurrence edges are therefore considered “noise” due to their untrue associations. For example, the term “glucose” may co-occur with the term “yellow fever,” but there is no real medical association between the two terms. Noise removal can be beneficial for downstream applications, such as link prediction [<xref ref-type="bibr" rid="ref7">7</xref>], representation learning [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>], and node classification [<xref ref-type="bibr" rid="ref10">10</xref>].</p>
      <p>The manual processes of cleaning data and removing noise are resource intensive. Therefore, an automated denoising method is ideal in facilitating the curation of knowledge graphs. Existing methods for denoising knowledge graphs can be divided into two groups: internal and external [<xref ref-type="bibr" rid="ref11">11</xref>]. For the internal method, the predefined semantics or rules [<xref ref-type="bibr" rid="ref12">12</xref>] are used for nonnumerical data. Outlier detection [<xref ref-type="bibr" rid="ref13">13</xref>] removes noise by modeling true data as a distribution for numerical data. As for external methods, a pretrained graph neural network integrates heterogeneous data sources [<xref ref-type="bibr" rid="ref14">14</xref>] to not only improve the performance of link prediction but also reduce the training time of the existing graph neural network model. In this paper, our methodology can be categorized as an internal method where data augmentation with a generative adversarial network (GAN) removes noise. GAN has been widely applied in medical image processing [<xref ref-type="bibr" rid="ref15">15</xref>] to denoise computed tomography images based on GAN with Wasserstein distance and perceptual similarity. Zhou et al [<xref ref-type="bibr" rid="ref16">16</xref>] previously showed improvement of ultrasonic image quality and reduction of noise caused by device limitations through the construction of a two-stage GAN. Other than the application of generating images, GAN has mainly been used for generating discrete medical data to contribute to the scenario of diagnosis of a disease with few labels [<xref ref-type="bibr" rid="ref17">17</xref>] or unbalanced classification [<xref ref-type="bibr" rid="ref18">18</xref>]. To the best of the authors’ knowledge, our study is the first to use GAN to denoise a biomedical knowledge graph.</p>
      <p>Here, we propose a framework that generates a similar graph from a raw knowledge graph to distinguish the true and false edges of association based on generative-based deep neural networks. Two recent generative-based models, Cross-Entropy Low-rank Logits (CELL) [<xref ref-type="bibr" rid="ref19">19</xref>] and a generative-based graph method (NetGAN) [<xref ref-type="bibr" rid="ref20">20</xref>], have been adopted as components to remove noise and retain true associations within two data sets: (1) a synthetic data set generated from CORA-ML [<xref ref-type="bibr" rid="ref21">21</xref>] with the same preprocessing as in NetGAN [<xref ref-type="bibr" rid="ref20">20</xref>]; and (2) a real data set constructed from CORD-19-on-FHIR data sets with heterogeneous biomedical associations (ie, chemical-disease, gene-disease, gene-chemical associations) [<xref ref-type="bibr" rid="ref5">5</xref>]. Our study shows the proposed method achieved promising results in the classification tasks for separating the true and false edges.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Problem Definition</title>
        <p>Given a network G(V, E), where <italic>V</italic> stands for a set of vertices (ie, biomedical concepts in the literature) and <italic>E</italic> represents the edges between two vertices (ie, the co-occurrence of two concepts), two kinds of edges exist, which are denoted as <italic>L</italic> (known true associations) and <italic>U</italic> (unknown true associations). We note that if no edge exists between two vertices, this will be considered a false association. The aim is to find the true associations among <italic>U</italic> (ie, denoise <italic>U</italic>). Specifically, a proposed method should have the capacity to determine whether unknown true associations from <italic>U</italic> are true or false associations.</p>
      </sec>
      <sec>
        <title>Framework</title>
        <sec>
          <title>Overview</title>
          <p>As this problem could be considered a classification of an unknown edge with a small number of known true associations and a large number of unknown true associations, we defined this classification problem as few-shot learning [<xref ref-type="bibr" rid="ref22">22</xref>]. We proposed a framework that used generative-based deep neural networks (eg, NetGAN and CELL) to denoise the unknown true associations in <italic>U</italic> based on similar networks generated. This framework was divided into 3 parts. We first briefly describe the GAN-based denoising graph adopted following the development of the framework, followed by an introduction of data preparation, which involved two strategies: (1) synthetic data generation and (2) real data set collection and annotation. A comprehensive design of our experiments was then conducted to verify our assumptions.</p>
        </sec>
        <sec>
          <title>Denoising Based on Generative-Based Deep Neural Networks</title>
          <p>We adopted NetGAN to generate a new network that would be used to distinguish the unknown associations in the raw training data (ie, graph). To achieve this, we randomly sampled walks from the raw graph consisting of unlabeled edges and trained a generator to learn the walks sampled and a discriminator on how to separate a real walk from a fake one. After achieving equilibrium among the discriminator and generator, the random walk sample from the generator was used for filtering the unreal edge in the raw graph. As determined in previous work by other researchers [<xref ref-type="bibr" rid="ref19">19</xref>], sampling enough random walks was sufficient to reconstruct the graph. Both the generator and discriminator used the long short-term memory (LSTM) architecture [<xref ref-type="bibr" rid="ref23">23</xref>] and were trained with the Wasserstein loss [<xref ref-type="bibr" rid="ref24">24</xref>]. The generator <italic>G</italic> generated large numbers of random walks (node sequence) of fixed length. The discriminator <italic>D</italic> distinguished the sequence of the nodes sampled from <italic>G</italic> and <italic>x</italic> that were sampled from the real graph (including unlabeled associations) with randomly started nodes. <italic>D</italic> and <italic>G</italic> played the following minimax game with the value function <italic>V(D, G)</italic>:</p>
          <disp-formula>
            <graphic xlink:href="jmir_v24i7e38584_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>Finally, the generator <italic>G</italic> generated a similar authentic graph network that could not be distinguished by the discriminator <italic>D</italic>.</p>
          <p>To generate the probability of the edges, CELL approximated it with a score matrix <italic>S</italic>, which was computed by <inline-graphic xlink:href="jmir_v24i7e38584_fig9.png" xlink:type="simple" mimetype="image"/> where <italic>n</italic> is the number of random walks, <italic>T</italic> is the length of each random walk, and diag(π) is the stationary distribution matrix. <italic>P</italic> is a transition matrix that approximates the unbiased random walk used in NetGAN. <italic>P</italic> can be low-rank approximated by <italic>W</italic>, which is the logit transition matrix and is solved by the objective function as:</p>
          <disp-formula>
            <graphic xlink:href="jmir_v24i7e38584_fig10.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>where <italic>A</italic> is the adjacency matrix, subject to rank(<italic>W</italic>) ≤ <italic>H</italic>. In practice, we further adapted node2vec [<xref ref-type="bibr" rid="ref25">25</xref>] for the random sampling process in NetGAN and constrained the edge generation length with <italic>k</italic> in the above loss function in CELL.</p>
        </sec>
        <sec>
          <title>Data Preparation</title>
          <sec>
            <title>Overview</title>
            <p>We generated two data sets for this study: (1) a synthetic data set based on CORA-ML, and (2) a real data set extracted from CORD-19-on-FHIR data sets [<xref ref-type="bibr" rid="ref5">5</xref>]. First, we defined two types of associations: labeled associations denoted as (<italic>L</italic>) (red colored) and unlabeled associations represented as (<italic>U</italic>) (green colored) (<xref rid="figure1" ref-type="fig">Figure 1</xref>), based on two types of association. This was used to construct our training and test graphs. The training graph consisted of both the labeled (<italic>L</italic>) and unlabeled (<italic>U</italic>) associations, while there were only labeled (<italic>L</italic>) associations in the test graph, as we need the ground truth for evaluating the performance of our proposed methods. The histogram of each data set is given below, where <xref rid="figure2" ref-type="fig">Figure 2</xref>A is the synthetic data set. This does not include the false associations added in our subsequent experiments. <xref rid="figure2" ref-type="fig">Figure 2</xref>B shows the histogram of degree distribution in the real data set.</p>
            <fig id="figure1" position="float">
              <label>Figure 1</label>
              <caption>
                <p>Overview of our investigation process. GAN: generative adversarial network; ROC-AUC: area under the receiver operating characteristic curve.</p>
              </caption>
              <graphic xlink:href="jmir_v24i7e38584_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </fig>
            <fig id="figure2" position="float">
              <label>Figure 2</label>
              <caption>
                <p>Histogram of degree distribution in the synthetic data set and real data set.</p>
              </caption>
              <graphic xlink:href="jmir_v24i7e38584_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </fig>
          </sec>
          <sec>
            <title>Synthetic Data Set</title>
            <p>The synthetic data set was generated based on CORA-ML with the same preprocessing work as NetGAN [<xref ref-type="bibr" rid="ref20">20</xref>]; we chose the largest connected component in the graph. The final total number of nodes and edges is shown in the top right corner of <xref rid="figure2" ref-type="fig">Figure 2</xref>A. To test our proposed methods, we took as ground truth the existing edges as true associations and the nonexistent edges as false associations. Detailed synthetic data processes can be found in Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
          </sec>
          <sec>
            <title>Real Data Set</title>
            <p>From CORD-19-on-FHIR [<xref ref-type="bibr" rid="ref5">5</xref>], we used two annotated networks (LitCovid [<xref ref-type="bibr" rid="ref26">26</xref>], Pubtator [<xref ref-type="bibr" rid="ref27">27</xref>]), extracting the COVID-19–related terms in our SPARQL query with 3 types of biomedical concepts (ie, gene, chemical, mutation/disease). After merging identical IDs from both LitCovid and Pubtator, we were able to obtain a new data set with a total of 23,578 nodes (<xref rid="figure2" ref-type="fig">Figure 2</xref>B). Finally, we randomly chose a proportional number of edges with a total of 500 associations (ie, chemical-disease, gene-disease, gene-chemical associations) from a total of 288,270 edges in the whole graph and manually labeled them as our labeled data set. Detailed data preprocessing and degree distribution for each type of association can be found in Supplementary 2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
          </sec>
        </sec>
        <sec>
          <title>Experiment Design</title>
          <sec>
            <title>Overview</title>
            <p>We conducted experiments on both a synthetic data set (ie, CORA-ML) and a real data set extracted from CORD-19-on-FHIR to investigate the capability of our proposed methods of incorporating unlabeled information for improving the link prediction performance despite limited annotation. We analyzed the performance of our models with multiple tasks based on two types of ratios to mimic the percentages of noise and annotation during the data curation: (1) noise ratio (NR), the percentage of true and false associations in the unlabeled edges; and (2) annotation ratio (AR), the percentage of training and testing associations in the labeled edges.</p>
          </sec>
          <sec>
            <title>Task 1: Test of AR Over the Synthetic Data Set</title>
            <p>We wanted to understand how many annotations were needed during the data curation for our proposed method to predict the true and false associations. We set a fixed NR and evaluated the performance of the tested method in two cases: one included the unlabeled data (ie, training set = labeled true and false associations + unlabeled associations), and the other did not include the unlabeled data (ie, training set = labeled true and false associations). The unlabeled associations were taken as true associations for training. In the experiment, we tested the performances based on different AR to mimic the percentage of the annotations already completed during the data curation. In practice, the AR varied from 1:9 to 9:1. For each ratio, we repeated the test 10 times with a random sampling of the training and testing sets to get the average results.</p>
          </sec>
          <sec>
            <title>Task 2: Test of NR Over the Synthetic Data Set</title>
            <p>In this task, we wanted to understand how many false associations were deemed as true associations in unlabeled data for training because it affected the prediction performance of the proposed method. We wanted to see whether the proposed method was robust enough to learn useful information for prediction, especially from unlabeled edges with more noise. With a fixed AR of 1:1, we tested the proposed method when there were more false edges than real edges in the unlabeled data. In practice, the NR varied from 1:1 to 1:9.</p>
          </sec>
          <sec>
            <title>Task 3: Test Over the Real Data Set</title>
            <p>After receiving the same annotation training, two of the authors (CJ and YY) manually labeled 500 of the 288,270 edges to simulate an extreme use case for data curation, and another author (VN) verified the annotation by randomly sampling the edges. The 500 edges consisted of 3 types: chemical-disease, gene-chemical, and gene-disease. Each edge was marked as true, false, or unknown. In practice, the annotations for gene-chemical were excluded and marked as unknown in the final evaluation after the authors had a discussion and reached a consensus that those annotations were conducted without enough confidence. Thus, in our final result report of the receiver operating characteristic curve, we only considered 2 types of associations: chemical-disease and gene-disease.</p>
          </sec>
        </sec>
        <sec>
          <title>Setting and Evaluation Metrics</title>
          <p>For each proposed method (ie, NetGAN and CELL), a grid search strategy was adopted for obtaining the best hyperparameters. In our experiment, we defined the search range by referencing the original settings in the articles. For NetGAN, the parameter ranges for the grid search are specified as walk <italic>p</italic> = {0.01, 0.1, 1, 10, 100} and <italic>q</italic> = {0.01, 0.1, 1, 10, 100}. For CELL, the parameter ranges are specified as rank <italic>H</italic> = {9, 20}, learning rate <italic>lr</italic> = {0.01, 0.05, 0.1}, and weight decay <italic>weight<sub>decay</sub></italic> = {1<italic>e</italic> – 5, 1<italic>e</italic> – 6, 1<italic>e</italic> – 7}. In practice, the original NetGAN was obtained from [<xref ref-type="bibr" rid="ref28">28</xref>], and the original CELL was obtained from [<xref ref-type="bibr" rid="ref29">29</xref>].</p>
          <p>In the evaluation step, we chose the area under the receiver operating characteristic curve (AUC ROC) and average precision (AP) as the metrics of link prediction for our proposed methods in both synthetic and real data sets. In the implementation, both AUC ROC and AP scores were calculated by scikit-learn [<xref ref-type="bibr" rid="ref30">30</xref>]. The visualization of predicted results in our real data set was a plot made with Cytoscape [<xref ref-type="bibr" rid="ref31">31</xref>], an open-source software platform for visualizing complex networks and integrating these with any type of attribute data.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Task Evaluation Outcomes</title>
        <sec>
          <title>Task 1: Comparison of the Link Prediction Results in a Graph With/Without the Unlabeled Associations Among Different ARs</title>
          <p>We conducted our experiments in two scenarios. One was the base case (dashed line in <xref rid="figure3" ref-type="fig">Figure 3</xref>), where we tested our models without using the unlabeled information. The other was the default case (solid line in <xref rid="figure3" ref-type="fig">Figure 3</xref>), in which we included the unlabeled associations in the link prediction tasks; unless explicitly stated as the base case, all of our statements in the following section refer to the default case. We reported the AUC ROC score in <xref rid="figure3" ref-type="fig">Figure 3</xref>. Here, the left subfigure displayed the AUC ROC curve with a fixed AR of 0.5. The dashed line named “Base NetGAN” indicates the method of NetGAN that did not incorporate the unlabeled information. “Base CELL” is the CELL method run in the base case. There was little difference between the two methods when considering the base case with an AUC ROC score of 0.597 for NetGAN and 0.591 for CELL. However, when unlabeled information was taken into consideration, both methods achieved better performance compared to the base case (the AUC ROC score of NetGAN was 0.724, while CELL achieved a score of 0.828). The right side of <xref rid="figure3" ref-type="fig">Figure 3</xref> shows the performance of the proposed methods in different ARs ranging from 0.1 to 0.9. We determined that CELL had overall better performance.</p>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>AUC ROC performance of NetGAN and CELL with/without unlabeled information. AR: annotation ratio; AUC ROC: area under the receiver operating characteristic curve; CELL: Cross-Entropy Low-rank Logits.</p>
            </caption>
            <graphic xlink:href="jmir_v24i7e38584_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Task 2: How Do the Models Perform With Different NRs in the Unlabeled Edges?</title>
          <p>We tested the performance of the methods 10 times in settings where the unlabeled information contained different ratios of noise (<xref rid="figure4" ref-type="fig">Figure 4</xref>). CELL demonstrated exceptional performance when the NR was 1:1. Even in the extreme case where the true versus false ratio reached 1:9, CELL still had better performance compared to NetGAN with an area under the curve (AUC) score of around 0.7. CELL had less variance in performance compared with NetGAN at all NRs. In other words, CELL had a relatively better capability and stability to use unlabeled data compared with NetGAN when dealing with the complexity of the NR in unknown information.</p>
          <fig id="figure4" position="float">
            <label>Figure 4</label>
            <caption>
              <p>Performance in terms of AUC score at different noise ratios. AUC: area under the curve; AUC ROC: area under the receiver operating characteristic curve; CELL: Cross-Entropy Low-rank Logits.</p>
            </caption>
            <graphic xlink:href="jmir_v24i7e38584_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Task 3: The Performance of Proposed Models in Our Collected Real Data Set</title>
          <p>After our exploration of our methods in task 2, we applied our methods to a real data set. Although the NR was unknown in our real data set, the proposed methods still performed better than random classification with the incorporation of unknown associations. In addition, compared with NetGAN, CELL still had an impressive result with an AUC ROC of up to 0.706 when the test and train ratio was 1:1 and the unknown associations accounted for about 99.95% of the associations, as shown in <xref rid="figure5" ref-type="fig">Figure 5</xref>. The good performance of CELL showed that it had an excellent capability to predict the true association with the use of unlabeled data. We reported the AUC ROC value of each type of association separately in <xref rid="figure6" ref-type="fig">Figure 6</xref>. Combining these results with Figure S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, which shows the edge degree of each type of association, we concluded that the larger the degree, the more noise each edge would contain. Thus, the results would be affected correspondingly. The average precision performance for our proposed models in our synthetic and real data sets can be found in Supplementary 3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
          <fig id="figure5" position="float">
            <label>Figure 5</label>
            <caption>
              <p>Performance on real data set. ROC: receiver operating characteristic; CELL: Cross-Entropy Low-rank Logits.</p>
            </caption>
            <graphic xlink:href="jmir_v24i7e38584_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <fig id="figure6" position="float">
            <label>Figure 6</label>
            <caption>
              <p>AUC ROC (area under the receiver operating characteristic curve) score for different types of associations in the real data set. CELL: Cross-Entropy Low-rank Logits.</p>
            </caption>
            <graphic xlink:href="jmir_v24i7e38584_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Denoised Knowledge Graph Generated From the Real Data Set</title>
          <p>We trained the adapted NetGAN with the whole real data set, and plotted the predicted denoised knowledge graph in <xref rid="figure7" ref-type="fig">Figure 7</xref>, where the edges are generated based on the score matrix calculated following the generation method used in NetGAN [<xref ref-type="bibr" rid="ref12">12</xref>]. There are a total of 21,016 edges in our visualization consisting of gene-chemical (7562), gene-disease (7613), and chemical-disease (5841). Three different colors (red, green, and blue) stand for three different types of associations/edges (gene-chemical, gene-disease, chemical-disease). The source file for the prediction can be found at [<xref ref-type="bibr" rid="ref32">32</xref>].</p>
          <fig id="figure7" position="float">
            <label>Figure 7</label>
            <caption>
              <p>Visualization of the predicted knowledge graph based on the real COVID-19–related data set.</p>
            </caption>
            <graphic xlink:href="jmir_v24i7e38584_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Results</title>
        <p>In this study, we proposed a method to automate the denoising of a knowledge graph generated via the counting of co-occurrence from biomedical literature. Our work can be considered as the preprocessing part for the curation of the knowledge graph. We adopted state-of-the-art generative-based graph methods, NetGAN and CELL, to leverage the unlabeled co-occurring biomedical entities in the training process by the perturbation of the original graph in the determination of an unknown edge. Two data sets (ie, synthetic and real data sets) were used to evaluate proposed methods in 3 link prediction tasks, and our experiments achieved promising results with both synthetic and real data sets.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>Despite the capability and stability of the methods used in this study, there are a few limitations that need to be discussed.</p>
        <p>First, the associations labeled in the real data set are limited due to limited resources. In addition, due to the reality of vagueness or missing concepts in the biomedical literature, there will be some bias. A large sample of annotated associations may provide a solution to reduce this bias and thus is needed for our future work. One way to potentially accomplish this goal would be to use natural language processing methods to standardize the concepts prior to annotation, which may improve the construction of knowledge graph input to our methods. Another way includes collaborating with professional annotators to both increase the number of annotations and improve their quality.</p>
        <p>Second, while we have achieved notable improvement with AUC around 0.7 in our real data set compared with random classification, there is still a gap between the experimental results in a controlled environment compared to the adaptation of the proposed methods for data curation in real-world scenarios. Performance improvement is still needed. The complexity of our investigated algorithm comes from the module of LSTM, which generates random walks for reconstructing the graph. An adaptation of binary neural networks [<xref ref-type="bibr" rid="ref33">33</xref>] that directly produces the discrete adjacency matrix for the graph may have the potential to significantly improve the efficiency of our investigated methods as reconstruction of the adjacency matrix from random walks will not be needed. Another potential direction for improving the performance of removing noise in our investigated methods could be looking into the possibilities of transfer learning or external methods as we discussed previously, such as in [<xref ref-type="bibr" rid="ref34">34</xref>]. By importing prior knowledge into the process of graph generation, we could employ the knowledge from an already built data set [<xref ref-type="bibr" rid="ref35">35</xref>] to help us remove the false associations when constructing our biomedical graph.</p>
        <p>Third, our evaluation was based on the logic of classifying the true or false associations directly, and was intentionally not focused on the impact evaluation of the denoised data sets generated in our work on downstream applications (eg, prediction for drug-target association and protein-protein interaction). Although we assume the performance will be improved in those applications [<xref ref-type="bibr" rid="ref36">36</xref>], we acknowledge that there has not yet been any scientific proof to support that. The whole data stream, including the methods of data processing, data curation (ie, denoising method proposed in this study), and application, needs to be investigated further to fill this gap, which could provide convincing evidence of the impact of our proposed method for denoising knowledge base construction.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Supplementary materials.</p>
        <media xlink:href="jmir_v24i7e38584_app1.docx" xlink:title="DOCX File, 411 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AP</term>
          <def>
            <p>average precision</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">AR</term>
          <def>
            <p>annotation ratio</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">AUC</term>
          <def>
            <p>area under the curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">AUC ROC</term>
          <def>
            <p>area under the receiver operating characteristic curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">CELL</term>
          <def>
            <p>Cross-Entropy Low-rank Logits</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">CORD-19</term>
          <def>
            <p>COVID-19 Open Research Dataset</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">GAN</term>
          <def>
            <p>generative adversarial network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">LSTM</term>
          <def>
            <p>long short-term memory</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was supported by funding from the National Institutes of Health (NIH) National Institute of General Medical Sciences (NIGMS) (K99GM135488).</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>NZ conceived and designed the study. CJ performed data integration, network construction, implementation of the algorithms, and experimentation, and created visualizations. NZ, CJ, and VN contributed to project implementation. NZ, CJ, and VN wrote the manuscript with contributions from all authors. All authors commented on and revised the manuscript.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
          <article-title>Coronavirus disease (COVID-19)</article-title>
          <source>World Health Organization</source>
          <access-date>2021-08-31</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.who.int/emergencies/diseases/novel-coronavirus-2019">https://www.who.int/emergencies/diseases/novel-coronavirus-2019</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Allot</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Keep up with the latest coronavirus research</article-title>
          <source>Nature</source>
          <year>2020</year>
          <month>03</month>
          <day>10</day>
          <volume>579</volume>
          <issue>7798</issue>
          <fpage>193</fpage>
          <lpage>193</lpage>
          <pub-id pub-id-type="doi">10.1038/d41586-020-00694-1</pub-id>
          <pub-id pub-id-type="medline">32157233</pub-id>
          <pub-id pub-id-type="pii">10.1038/d41586-020-00694-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>El Mouden</surname>
              <given-names>ZA</given-names>
            </name>
            <name name-style="western">
              <surname>Taj</surname>
              <given-names>RM</given-names>
            </name>
            <name name-style="western">
              <surname>Jakimi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hajar</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Towards using graph analytics for tracking covid-19</article-title>
          <source>Procedia Comput Sci</source>
          <year>2020</year>
          <volume>177</volume>
          <fpage>204</fpage>
          <lpage>211</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1877-0509(20)32296-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.procs.2020.10.029</pub-id>
          <pub-id pub-id-type="medline">33200008</pub-id>
          <pub-id pub-id-type="pii">S1877-0509(20)32296-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC7657018</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Groza</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Detecting fake news for the new coronavirus by reasoning on the Covid-19 ontology</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on April 26, 2020
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2004.12330"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="web">
          <article-title>CORD-19-on-FHIR</article-title>
          <source>GitHub</source>
          <access-date>2021-09-27</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/fhircat/CORD-19-on-FHIR">https://github.com/fhircat/CORD-19-on-FHIR</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Oniani</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Constructing co-occurrence network embeddings to assist association extraction for COVID-19 and other coronavirus infectious diseases</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2020</year>
          <month>08</month>
          <day>01</day>
          <volume>27</volume>
          <issue>8</issue>
          <fpage>1259</fpage>
          <lpage>1267</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/32458963"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocaa117</pub-id>
          <pub-id pub-id-type="medline">32458963</pub-id>
          <pub-id pub-id-type="pii">5847598</pub-id>
          <pub-id pub-id-type="pmcid">PMC7314034</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zong</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>RSN</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wen</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Drug-target prediction utilizing heterogeneous bio-linked network embeddings</article-title>
          <source>Brief Bioinform</source>
          <year>2021</year>
          <month>01</month>
          <day>18</day>
          <volume>22</volume>
          <issue>1</issue>
          <fpage>568</fpage>
          <lpage>580</lpage>
          <pub-id pub-id-type="doi">10.1093/bib/bbz147</pub-id>
          <pub-id pub-id-type="medline">31885036</pub-id>
          <pub-id pub-id-type="pii">5681788</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Amimeur</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Dou</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Density-aware local Siamese autoencoder network embedding with autoencoder graph clustering</article-title>
          <year>2018</year>
          <month>10</month>
          <day>13</day>
          <conf-name>2018 IEEE International Conference on Big Data (Big Data)</conf-name>
          <conf-date>2018</conf-date>
          <conf-loc>Seattle, WA, USA</conf-loc>
          <pub-id pub-id-type="doi">10.1109/bigdata.2018.8621992</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Dou</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Density-adaptive local edge representation learning with generative adversarial network multi-label edge classification</article-title>
          <year>2018</year>
          <month>11</month>
          <day>17</day>
          <conf-name>2018 IEEE International Conference on Data Mining (ICDM)</conf-name>
          <conf-date>2018</conf-date>
          <conf-loc>Singapore</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icdm.2018.00203</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Dou</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Integrating local vertex/edge embedding via deep matrix fusion and siamese multi-label classification</article-title>
          <year>2019</year>
          <month>12</month>
          <day>9</day>
          <conf-name>2019 IEEE International Conference on Big Data (Big Data)</conf-name>
          <conf-date>2019</conf-date>
          <conf-loc>Los Angeles, CA, USA</conf-loc>
          <pub-id pub-id-type="doi">10.1109/bigdata47090.2019.9006299</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Paulheim</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Knowledge graph refinement: A survey of approaches and evaluation methods</article-title>
          <source>Semant Web</source>
          <year>2016</year>
          <month>12</month>
          <day>06</day>
          <volume>8</volume>
          <issue>3</issue>
          <fpage>489</fpage>
          <lpage>508</lpage>
          <pub-id pub-id-type="doi">10.3233/sw-160218</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pujara</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Miao</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Getoor</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Knowledge graph identification</article-title>
          <year>2013</year>
          <month>1</month>
          <day>1</day>
          <conf-name>ISWC 2013</conf-name>
          <conf-date>2013</conf-date>
          <conf-loc>Sydney, Australia</conf-loc>
          <fpage>542</fpage>
          <lpage>557</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-642-41335-3_34</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wienand</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Paulheim</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Detecting incorrect numerical data in DBpedia</article-title>
          <year>2014</year>
          <month>1</month>
          <day>1</day>
          <conf-name>ESWC 2014</conf-name>
          <conf-date>2014</conf-date>
          <conf-loc>Paris, France</conf-loc>
          <fpage>504</fpage>
          <lpage>518</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-319-07443-6_34</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Long</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Fang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kwoh</surname>
              <given-names>CK</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Pre-training graph neural networks for link prediction in biomedical networks</article-title>
          <source>Bioinformatics</source>
          <year>2022</year>
          <month>02</month>
          <day>16</day>
          <fpage>2254</fpage>
          <lpage>2262</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btac100</pub-id>
          <pub-id pub-id-type="medline">35171981</pub-id>
          <pub-id pub-id-type="pii">6529539</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Mou</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Kalra</surname>
              <given-names>MK</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Low-dose CT image denoising using a generative adversarial network with Wasserstein distance and perceptual loss</article-title>
          <source>IEEE Trans Med Imaging</source>
          <year>2018</year>
          <month>6</month>
          <volume>37</volume>
          <issue>6</issue>
          <fpage>1348</fpage>
          <lpage>1357</lpage>
          <pub-id pub-id-type="doi">10.1109/tmi.2018.2827462</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Qi</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Image quality improvement of hand-held ultrasound devices with a two-stage generative adversarial network</article-title>
          <source>IEEE Trans Biomed Eng</source>
          <year>2020</year>
          <month>1</month>
          <volume>67</volume>
          <issue>1</issue>
          <fpage>298</fpage>
          <lpage>311</lpage>
          <pub-id pub-id-type="doi">10.1109/tbme.2019.2912986</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Biswal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Malin</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Duke</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Stewart</surname>
              <given-names>WF</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Generating multi-label discrete patient records using generative adversarial networks</article-title>
          <year>2017</year>
          <conf-name>Machine Learning for Healthcare 2017</conf-name>
          <conf-date>2017</conf-date>
          <conf-loc>Boston, Massachusetts</conf-loc>
          <fpage>305</fpage>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Arnold</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Yuan</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Semi-supervised Rare Disease Detection Using Generative Adversarial Network</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on December 3, 2018
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1812.00547"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rendsburg</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Heidrich</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Von Luxburg</surname>
              <given-names>U</given-names>
            </name>
          </person-group>
          <article-title>NetGAN without GAN: from random walks to low-rank approximations</article-title>
          <year>2020</year>
          <conf-name>37th International Conference on Machine Learning</conf-name>
          <conf-date>2020</conf-date>
          <conf-loc>Virtual</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bojchevski</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shchur</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Zügner</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Günnemann</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>NetGAN: generating graphs via random walks</article-title>
          <year>2018</year>
          <conf-name>35th International Conference on Machine Learning</conf-name>
          <conf-date>2018</conf-date>
          <conf-loc>Stockholm, Sweden</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McCallum</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Nigam</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Rennie</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Seymore</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Automating the construction of internet portals with machine learning</article-title>
          <source>Information Retrieval</source>
          <year>2000</year>
          <volume>3</volume>
          <fpage>127</fpage>
          <lpage>163</lpage>
          <pub-id pub-id-type="doi">10.1023/A:1009953814988</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yao</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Kwok</surname>
              <given-names>JT</given-names>
            </name>
            <name name-style="western">
              <surname>Ni</surname>
              <given-names>LM</given-names>
            </name>
          </person-group>
          <article-title>Generalizing from a few examples: a survey on few-shot learning</article-title>
          <source>ACM Comput Surv</source>
          <year>2021</year>
          <month>05</month>
          <day>31</day>
          <volume>53</volume>
          <issue>3</issue>
          <fpage>1</fpage>
          <lpage>34</lpage>
          <pub-id pub-id-type="doi">10.1145/3386252</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hochreiter</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidhuber</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Long short-term memory</article-title>
          <source>Neural Comput</source>
          <year>1997</year>
          <month>11</month>
          <day>15</day>
          <volume>9</volume>
          <issue>8</issue>
          <fpage>1735</fpage>
          <lpage>1780</lpage>
          <pub-id pub-id-type="doi">10.1162/neco.1997.9.8.1735</pub-id>
          <pub-id pub-id-type="medline">9377276</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Arjovsky</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chintala</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Bottou</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Wasserstein generative adversarial networks</article-title>
          <year>2017</year>
          <conf-name>ICML'17: Proceedings of the 34th International Conference on Machine Learning</conf-name>
          <conf-date>2017</conf-date>
          <conf-loc>Sydney, NSW, Australia</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Grover</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Leskovec</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>node2vec: Scalable feature learning for networks</article-title>
          <year>2016</year>
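          <conf-name>KDD ’16: Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name>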
          <conf-date>2016</conf-date>
          <conf-loc>San Francisco, CA, USA</conf-loc>
          <pub-id pub-id-type="doi">10.1145/2939672.2939754</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Allot</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>LitCovid: an open database of COVID-19 literature</article-title>
          <source>Nucleic Acids Res</source>
          <year>2021</year>
          <month>01</month>
          <day>08</day>
          <volume>49</volume>
          <issue>D1</issue>
          <fpage>D1534</fpage>
          <lpage>D1540</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/33166392"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/nar/gkaa952</pub-id>
          <pub-id pub-id-type="medline">33166392</pub-id>
          <pub-id pub-id-type="pii">5964074</pub-id>
          <pub-id pub-id-type="pmcid">PMC7778958</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Kao</surname>
              <given-names>HY</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>PubTator: a web-based text mining tool for assisting biocuration</article-title>
          <source>Nucleic Acids Res</source>
          <year>2013</year>
          <month>07</month>
          <volume>41</volume>
          <issue>Web Server issue</issue>
          <fpage>W518</fpage>
          <lpage>W522</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/23703206"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/nar/gkt441</pub-id>
          <pub-id pub-id-type="medline">23703206</pub-id>
          <pub-id pub-id-type="pii">gkt441</pub-id>
          <pub-id pub-id-type="pmcid">PMC3692066</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="web">
          <article-title>Implementation of the paper 'NetGAN: Generating Graphs via Random Walks'</article-title>
          <source>GitHub</source>
          <access-date>2021-11-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/danielzuegner/netgan">https://github.com/danielzuegner/netgan</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="web">
          <article-title>Repository of the ICML 2020 paper NetGAN without GAN: From Random Walks to Low-Rank Approximations</article-title>
          <source>GitHub</source>
          <year>2020</year>
          <access-date>2021-11-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/hheidrich/CELL">https://github.com/hheidrich/CELL</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="web">
          <article-title>scikit-learn: machine learning in Python</article-title>
          <source>GitHub</source>
          <access-date>2021-11-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/scikit-learn/scikit-learn">https://github.com/scikit-learn/scikit-learn</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shannon</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Markiel</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ozier</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Baliga</surname>
              <given-names>NS</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>JT</given-names>
            </name>
            <name name-style="western">
              <surname>Ramage</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Amin</surname>
              <given-names>Nada</given-names>
            </name>
            <name name-style="western">
              <surname>Schwikowski</surname>
              <given-names>Benno</given-names>
            </name>
            <name name-style="western">
              <surname>Ideker</surname>
              <given-names>Trey</given-names>
            </name>
          </person-group>
          <article-title>Cytoscape: a software environment for integrated models of biomolecular interaction networks</article-title>
          <source>Genome Res</source>
          <year>2003</year>
          <month>11</month>
          <volume>13</volume>
          <issue>11</issue>
          <fpage>2498</fpage>
          <lpage>2504</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://genome.cshlp.org/cgi/pmidlookup?view=long&#38;pmid=14597658"/>
          </comment>
          <pub-id pub-id-type="doi">10.1101/gr.1239303</pub-id>
          <pub-id pub-id-type="medline">14597658</pub-id>
          <pub-id pub-id-type="pii">13/11/2498</pub-id>
          <pub-id pub-id-type="pmcid">PMC403769</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="web">
          <article-title>Deep denoising of raw biomedical knowledge graph from COVID-19 literature, LitCovid and Pubtator</article-title>
          <source>GitHub</source>
          <access-date>2021-10-04</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/bioIKEA/deep_denoising_knowledge_graph">https://github.com/bioIKEA/deep_denoising_knowledge_graph</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dong</surname>
              <given-names>HW</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>YH</given-names>
            </name>
          </person-group>
          <article-title>Training generative adversarial networks with binary neurons by end-to-end backpropagation</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on October 10, 2018
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1810.04714"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Barzilay</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Jaakkola</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Junction tree variational autoencoder for molecular graph generation</article-title>
          <year>2018</year>
          <conf-name>35th International Conference on Machine Learning</conf-name>
          <conf-date>2018</conf-date>
          <conf-loc>Stockholm, Sweden</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rossanez</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dos Reis</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>da Silva Torres</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>de Ribaupierre</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>KGen: a knowledge graph generator from biomedical scientific literature</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2020</year>
          <month>12</month>
          <day>14</day>
          <volume>20</volume>
          <issue>Suppl 4</issue>
          <fpage>314</fpage>
          <lpage>328</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-020-01341-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-020-01341-5</pub-id>
          <pub-id pub-id-type="medline">33317512</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-020-01341-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC7734730</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Algan</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Ulusoy</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Image classification with deep learning in the presence of noisy labels: A survey</article-title>
          <source>Knowledge-Based Systems</source>
          <year>2021</year>
          <month>03</month>
          <volume>215</volume>
          <fpage>106771</fpage>
          <pub-id pub-id-type="doi">10.1016/j.knosys.2021.106771</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
