<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v22i7e17853</article-id>
      <article-id pub-id-type="pmid">32706701</article-id>
      <article-id pub-id-type="doi">10.2196/17853</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Crawling the German Health Web: Exploratory Study and Graph Analysis</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Nagel</surname>
            <given-names>Sebastian</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Sánchez Bocanegra</surname>
            <given-names>Carlos Luis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Zowalla</surname>
            <given-names>Richard</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Medical Informatics</institution>
            <institution>Heilbronn University</institution>
            <addr-line>Max-Planck-Str 39</addr-line>
            <addr-line>Heilbronn</addr-line>
            <country>Germany</country>
            <phone>49 713 150 46791</phone>
            <email>richard.zowalla@hs-heilbronn.de</email>
          </address>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1236-7398</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Wetter</surname>
            <given-names>Thomas</given-names>
          </name>
          <degrees>Dr rer nat, Dipl Math</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9699-9709</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Pfeifer</surname>
            <given-names>Daniel</given-names>
          </name>
          <degrees>Dr Ing</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1794-1184</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Medical Informatics</institution>
        <institution>Heilbronn University</institution>
        <addr-line>Heilbronn</addr-line>
        <country>Germany</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Center for Machine Learning</institution>
        <institution>Heilbronn University</institution>
        <addr-line>Heilbronn</addr-line>
        <country>Germany</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Institute of Medical Biometry and Informatics</institution>
        <institution>Heidelberg University</institution>
        <addr-line>Heidelberg</addr-line>
        <country>Germany</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Institute of Software and Information Systems Engineering</institution>
        <institution>Ben-Gurion University of the Negev</institution>
        <addr-line>Beer Sheva</addr-line>
        <country>Israel</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Department of Biomedical Informatics and Medical Education</institution>
        <institution>University of Washington</institution>
        <addr-line>Seattle, WA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Richard Zowalla <email>richard.zowalla@hs-heilbronn.de</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>7</month>
        <year>2020</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>24</day>
        <month>7</month>
        <year>2020</year>
      </pub-date>
      <volume>22</volume>
      <issue>7</issue>
      <elocation-id>e17853</elocation-id>
      <history>
        <date date-type="received">
          <day>16</day>
          <month>1</month>
          <year>2020</year>
        </date>
        <date date-type="rev-request">
          <day>16</day>
          <month>3</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>25</day>
          <month>3</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>14</day>
          <month>5</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©Richard Zowalla, Thomas Wetter, Daniel Pfeifer. Originally published in the Journal of Medical Internet Research (http://www.jmir.org), 24.07.2020.</copyright-statement>
      <copyright-year>2020</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on http://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="http://www.jmir.org/2020/7/e17853/" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>The internet has become an increasingly important resource for health information. However, with a growing amount of web pages, it is nearly impossible for humans to manually keep track of evolving and continuously changing content in the health domain. To better understand the nature of all web-based health information as given in a specific language, it is important to identify (1) information hubs for the health domain, (2) content providers of high prestige, and (3) important topics and trends in the health-related web. In this context, an automatic web crawling approach can provide the necessary data for a computational and statistical analysis to answer (1) to (3).</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study demonstrates the suitability of a focused crawler for the acquisition of the German Health Web (GHW) which includes all health-related web content of the three mostly German-speaking countries Germany, Austria and Switzerland. Based on the gathered data, we provide a preliminary analysis of the GHW’s graph structure covering its size, most important content providers and a ratio of public to private stakeholders. In addition, we provide our experiences in building and operating such a highly scalable crawler.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>A support vector machine classifier was trained on a large data set acquired from various German content providers to distinguish between health-related and non–health-related web pages. The classifier was evaluated using accuracy, recall and precision on an 80/20 training/test split (TD1) and against a crowd-validated data set (TD2). To implement the crawler, we extended the open-source framework StormCrawler. The actual crawl was conducted for 227 days. The crawler was evaluated by using harvest rate and its recall was estimated using a seed-target approach.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>In total, n=22,405 seed URLs with country-code top level domains .de: 85.36% (19,126/22,405), .at: 6.83% (1530/22,405), .ch: 7.81% (1749/22,405), were collected from Curlie and a previous crawl. The text classifier achieved an accuracy on TD1 of 0.937 (TD2=0.966), a precision on TD1 of 0.934 (TD2=0.954) and a recall on TD1 of 0.944 (TD2=0.989). The crawl yields 13.5 million presumably relevant and 119.5 million nonrelevant web pages. The average harvest rate was 19.76%; recall was 0.821 (4105/5000 targets found). The resulting host-aggregated graph contains 215,372 nodes and 403,175 edges (network diameter=25; average path length=6.466; average degree=1.872; average in-degree=1.892; average out-degree=1.845; modularity=0.723). Among the 25 top-ranked pages for each country (according to PageRank), 40% (30/75) were web sites published by public institutions. 25% (19/75) were published by nonprofit organizations and 35% (26/75) by private organizations or individuals.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>The results indicate that the presented crawler is a suitable method for acquiring a large fraction of the GHW. As desired, the computed statistical data allows for determining major information hubs and important content providers on the GHW. In the future, the acquired data may be used to assess important topics and trends but also to build health-specific search engines.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>health information</kwd>
        <kwd>internet</kwd>
        <kwd>web crawling</kwd>
        <kwd>distributed system</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Overview</title>
        <p>The internet has become an increasingly important resource for health information, especially for laypeople [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref10">10</xref>]. Web users perform online searches to obtain health information regarding diseases, diagnoses, and different treatments [<xref ref-type="bibr" rid="ref1">1</xref>]. However, with a growing amount of web pages, it is nearly impossible for humans to manually keep track of evolving and continuously changing content in the health domain. According to the (German) Good Practice Guidelines for Health Information, “evidence-based health information is...a trustworthy state of the medical knowledge” [<xref ref-type="bibr" rid="ref11">11</xref>]. Even if health information is found via well-known search engines, it does not necessarily meet with this definition and may be influenced by commercial interests [<xref ref-type="bibr" rid="ref12">12</xref>].</p>
        <p>Therefore, it is important to identify health content providers and assess their relevance [<xref ref-type="bibr" rid="ref13">13</xref>]. In this context, an automatic web crawling approach can help to understand the structure of the health-related web (ie, all web pages offering health-related information). By focusing only on such content, it is possible to (1) identify information hubs for the health domain, (2) find content providers of high prestige, and (3) identify important topics and trends within the health-related web. In future work, the identified content providers of high prestige could be analyzed for their respective trustworthiness and their compliance with the criteria of evidence-based health information [<xref ref-type="bibr" rid="ref11">11</xref>].</p>
        <p>According to Van der Bosch et al [<xref ref-type="bibr" rid="ref14">14</xref>], in 2015 the (indexed) web was estimated to consist of roughly 47 billion web pages. However, only a fraction of those web pages contain health-related information. So, in order to determine the structure of the health-related web, it is crucial to determine for each web page’s content whether it is health-related or not.</p>
        <p>A related filter method can be used within a web crawler to filter out irrelevant web pages, therefore reducing the total number of web pages that need to be crawled. This saves time and financial resources for the crawling task. Nevertheless, analyzing such an amount of data requires high performance hardware and parallelization approaches.</p>
        <p>Yet, to the best of the authors’ knowledge, no study has been previously conducted and published about the health-related web. This study provides a first analysis of the health-related web limited to web pages in German, the so-called German health web (GHW). In this regard, we restrict our study to the three mostly German-speaking countries Germany, Austria and Switzerland (D-A-CH).</p>
        <p>A distributed focused crawler for the GHW is outlined and evaluated as part of this study. Using the acquired data it is possible to extract the graph structure of the GHW for the goals listed above and provide access to health-related text material for linguistic analysis and further research purposes.</p>
      </sec>
      <sec>
        <title>Related Work</title>
        <sec>
          <title>Importance of Health Information on the Web</title>
          <p>The World Wide Web and its graph structure have been a subject of study for many years [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref17">17</xref>]. However, domain-specific and/or country-dependent analyses of graph properties have not been the primary scope of research in recent years [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. Moreover, a review by Kumar et al [<xref ref-type="bibr" rid="ref20">20</xref>] shows that research related to focused crawling was popular in the late 1990s and mid 2000s but seems to have lost attention in the last decade. As the internet is an important resource for health information [<xref ref-type="bibr" rid="ref21">21</xref>], finding relevant content remains an important task [<xref ref-type="bibr" rid="ref8">8</xref>].</p>
        </sec>
        <sec>
          <title>Web Crawling of Health Information</title>
          <p>In 2005, Tang et al [<xref ref-type="bibr" rid="ref22">22</xref>] investigated the use of focused crawling techniques to assess the topic relevance and quality of medical information. For this purpose, n=160 seeds from the category depression of the Open Directory Project (now Curlie) were selected. They found that such an approach fetches twice as many pages as a crawler without topic focus. In another study, Pirkola et al [<xref ref-type="bibr" rid="ref23">23</xref>] described the use of focused crawlers to acquire text from the genomics domain. They found that “the source of seed URLs and the issues related to the multilinguality of the web” are major challenges in this area.</p>
          <p>Abbasi et al [<xref ref-type="bibr" rid="ref24">24</xref>] used a focused crawler to collect credible medical sentiments and opinions in the context of drug surveillance. In this context, their crawler was evaluated on “a set of 100 seed URLs pertaining to health and drug-related websites” and achieved a harvest rate of 10.06% (1,243,074/12,362,406). In 2016, Abbasi et al [<xref ref-type="bibr" rid="ref25">25</xref>] demonstrated the use of a focused crawler to acquire credible online medical content in the context of postmarket drug surveillance. Their method was able to “collect over 80% of all relevant credible content in the first 20% of the crawl.”</p>
          <p>In Xu et al [<xref ref-type="bibr" rid="ref26">26</xref>], a user-oriented adaptive focused crawler was implemented and applied in the cancer domain (ie, on breast and lung cancer). The authors found “that the new crawler can substantially accelerate the online user-generated content acquisition efforts for cancer researchers.”</p>
          <p>Amalia et al [<xref ref-type="bibr" rid="ref27">27</xref>] presented a focused crawler for the acquisition of health-related articles written in Indonesian. In this study, different crawling strategies and their relative impacts on crawler performance were investigated. They found that crawling larger sites first improves the number of crawled articles.</p>
          <p>In 2016, Rheinländer et al [<xref ref-type="bibr" rid="ref28">28</xref>] studied the scalability of an information extraction framework using a focused crawling approach to collect and analyze “a 1 TB collection of web text from the biomedical domain” written in English. For this purpose, they generated a set of n=485,462 seeds using commercial search engines with which their focused crawler achieved a harvest rate of 38%.</p>
        </sec>
      </sec>
      <sec>
        <title>Aims of the Study</title>
        <p>The authors decided to concentrate on health-related web pages available free of charge on the internet in the D-A-CH region that can be found under the respective country-code top-level domains (ccTLDs) .de, .at, and .ch. In this context, the aim of this study was fourfold:</p>
        <list list-type="bullet">
          <list-item>
            <p>Demonstrate the suitability of a focused crawler approach for the acquisition of health-related content in the D-A-CH region</p>
          </list-item>
          <list-item>
            <p>Provide a curated list of seed points for the health domain in the D-A-CH region</p>
          </list-item>
          <list-item>
            <p>Provide a crowd-validated evaluation data set consisting of health-related and non–health-related URLs that can be used to evaluate other classifiers used in focused crawlers for the health domain in the D-A-CH region</p>
          </list-item>
          <list-item>
            <p>Give preliminary insights into the graph structure of the GHW</p>
          </list-item>
        </list>
        <p>To the best of the authors’ knowledge, no similar study has been previously conducted on a large scale. In particular, this has not been done for the GHW.</p>
        <p>Besides a statistical analysis of the GHW, this paper shares our experience in building and operating a highly scalable focused crawler. Thus, researchers who want to perform a similar analysis for web pages of the health domain in their country can benefit from the experiences gained.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Focused Web Crawling</title>
        <sec>
          <title>Basic Web Crawling Process</title>
          <p>As depicted in <xref rid="figure1" ref-type="fig">Figure 1</xref>, a web crawler traverses the directed graph of the web [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. Starting from a given set of seed URLs, the web crawler fetches web pages. After the download is successful, the HTML of a web page is parsed and hyperlinks to other web pages are extracted. These links are then analyzed and added to a priority queue called frontier [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. The web graph is then visited via those URLs kept in the frontier. The crawler repeats this process until the frontier is empty or it is stopped manually.</p>
          <p>Due to the enormous size of the web [<xref ref-type="bibr" rid="ref14">14</xref>], one must focus on a certain domain of interest to speed up the crawl. In this context, a focused crawler only visits those outgoing links of a web site that appear to be relevant for the given topic. To determine whether a link is relevant or not, the assumption is made that web pages of a certain topic are most likely linked to other web pages of the same topic [<xref ref-type="bibr" rid="ref32">32</xref>]. To assess the relevance of a certain web page, a focused crawler often uses techniques from the field of machine learning [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. Classifiers are then leveraged to filter irrelevant content during the crawl process and assign priority to extracted URLs based on the classification result.</p>
          <fig id="figure1" position="float">
            <label>Figure 1</label>
            <caption>
              <p>Schematic representation of the web graph traversal by a crawler. Pages colored in blue represent processed pages; in green, pages referenced in the frontier; in gray, undiscovered web content. Pages in dashed blue represent so-called initial seed pages.</p>
            </caption>
            <graphic xlink:href="jmir_v22i7e17853_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>System Architecture and Processing Workflow</title>
          <p>Given the results of the Van der Bosch study [<xref ref-type="bibr" rid="ref14">14</xref>], it is obvious that sequential processing of such an amount of data would take a tremendous amount of time and/or financial resources. For this reason, a parallel and distributed system architecture as described by Shkapenyuk [<xref ref-type="bibr" rid="ref33">33</xref>] is necessary to crawl within a reasonable amount of time: to make results available before the web has notably changed. Therefore, such an architecture must be designed to handle thousands of worker threads to fetch web pages in parallel. Besides efficiency in terms of throughput, a crawler should also respect crawler ethics [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref34">34</xref>] (ie, it should honor the robot exclusion protocol [robots.txt]) [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]; this protocol allows web site administrators to inform the web crawler which parts of a web site should not be processed. In addition, a crawler should not overwhelm the target web server by sending too many requests in a short period of time. For this reason, applying a politeness delay (time between requests to the same server) is mandatory. Furthermore, it must provide robust protection against so-called spider traps, or web sites containing programmatic errors or dynamically generated links that cause the crawler to be trapped in an infinite loop [<xref ref-type="bibr" rid="ref29">29</xref>]. Moreover, the HTML parser must tolerate broken and/or invalid markup [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]. In addition, text extraction components must handle boilerplate detection in an appropriate way [<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref40">40</xref>].</p>
          <p>There are several frameworks that realize such distributed crawlers; we built our system on top of the open-source framework StormCrawler [<xref ref-type="bibr" rid="ref41">41</xref>], a software development kit for building low-latency, scalable crawlers based on the Apache Storm framework [<xref ref-type="bibr" rid="ref42">42</xref>]. It lacks out-of-the-box components for focused crawling but offers the possibility of adding custom extensions and configuration options. For this reason, we extended it with classifiers and the necessary logic to implement a focused crawler. <xref rid="figure2" ref-type="fig">Figure 2</xref> depicts the architecture of StormCrawler (black) with our focused crawler extension (orange).</p>
          <p>The StormCrawler software development kit provides a conventional recursive crawler architecture (upper part of <xref rid="figure2" ref-type="fig">Figure 2</xref>); a seed injector reads URLs from a text file and adds them to the CrawlDB, which acts as the crawl frontier and content storage. Next, a set of spouts emit yet unseen URLs from the crawl database. To maintain politeness, these URLs are then assigned to cluster nodes (based on their resolved hostname) and directed to the fetchers. The latter will download the respective web pages and forward them to the parsers for link and content extraction; unseen URLs are added to the frontier. Next, the content is sent to the indexers, which store it inside the CrawlDB (in this case an Elasticsearch cluster [<xref ref-type="bibr" rid="ref43">43</xref>]).</p>
          <p>To add focus to StormCrawler, the framework was extended by adding additional bolts and filter components (lower part of <xref rid="figure2" ref-type="fig">Figure 2</xref>). After a web page is parsed, the raw text is extracted by using boilerplate detection and XML path language expressions. It is then processed by a text classification pipeline to compute the relevance to the health domain as described by Joachims [<xref ref-type="bibr" rid="ref44">44</xref>] and Zowalla et al [<xref ref-type="bibr" rid="ref45">45</xref>]. If a web page is classified as relevant, it is marked for further processing.</p>
          <p>Next, a priority value (in this case a value between 0 and 127) is assigned to every URL contained on the given web page [<xref ref-type="bibr" rid="ref46">46</xref>]. This is done by using (1) the class probability of the current web page [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref32">32</xref>], (2) a check whether the extracted URLs target the same hostname (a web site covering a certain topic will most likely contain more web pages of that topic) [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref32">32</xref>], (3) the anchor text of that link [<xref ref-type="bibr" rid="ref47">47</xref>], and (4) the link itself using an n-gram approach [<xref ref-type="bibr" rid="ref48">48</xref>]. Higher priority values will guarantee earlier processing.</p>
          <p>In addition, we implemented a soft focused crawling strategy using tunneling to avoid stopping at the first irrelevant page. For example, many front pages of portals may be classified as irrelevant but link to relevant health-related content [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. To do so, a specific filter component tracks the depth and stops after a given number of steps n (eg, n=2, n=3). Irrelevant web pages are not indexed.</p>
          <p>To build the web graph of the health domain, during the crawl process a specific bolt keeps track of the visited and discovered links and adds them to a clustered Neo4J graph database. For statistics and metrics related to the crawl, another bolt continuously updates the crawling progress inside a PostgreSQL database. The crawling and classification process is repeated until the frontier is empty or it is stopped manually by the user.</p>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>Architecture of a focused crawler based on the StormCrawler software development kit. Spouts (tap symbol) emit data (here: URLs), bolts (lightning symbol) process data (ie, fetch, parse, and store the extracted content). Bolts can be enhanced with URL filters (white filter symbol) or parse filters (black filter symbol). URL filters are used to remove URLs based on predefined criteria. Parse filters include URL filters but are primarily used to clean the parsed content and compute topic relevance and priority.</p>
            </caption>
            <graphic xlink:href="jmir_v22i7e17853_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>System Environment and Hardware Setup</title>
          <p>In total, 22 virtual machines participate in the computing cluster providing the infrastructure for the crawler. The corresponding services are used to run, manage, and analyze the crawled web pages on the fly. For this setup, two physical servers of a Cisco unified computing system provide the computational resources and run as a virtualization platform to allow shared resource allocation.</p>
          <p>Each server offers two physical central processing units (Intel Xeon E5-2689) with 8 cores each and 256 GB of memory. On the network side, the Cisco unified computing system is attached to two optical 10 gigabit ethernet fibers that provide high bandwidth and ensure scalable throughput. A network attached storage system provides a total disc capacity of 60 TB to persist crawled data and store participating virtual machines via the network file system protocol. This network attached storage is also connected via optical fibers to our university’s core router.</p>
        </sec>
      </sec>
      <sec>
        <title>Evaluation Measures for Focused Web Crawling</title>
        <p>Several studies state that the primary metric in evaluating focused crawler performance is the harvest rate [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref50">50</xref>]. Harvest rate is defined as “the fraction of webpages crawled that satisfy the relevance criteria among all crawled webpages” [<xref ref-type="bibr" rid="ref20">20</xref>]. Previous studies reported that the harvest rate ranges between 10% and 45% for such systems [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref51">51</xref>].</p>
        <p>In addition, the recall (also known as sensitivity) measure can be estimated by using the seed-target approach [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref53">53</xref>]. In this context, the initial set of seed pages is split into two sets of which one can be used as seeds and the other as targets (T). <xref rid="figure3" ref-type="fig">Figure 3</xref> depicts the relationship between relevant (R), crawled (S) and target web pages.</p>
        <p>According to Liu [<xref ref-type="bibr" rid="ref29">29</xref>], the recall may be estimated if T is a representative, unbiased sample of R independent of the crawling process by the equation in <xref rid="figure4" ref-type="fig">Figure 4</xref> at any time t.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Relationship between target, relevant, and crawled web pages. Recall is estimated based on known relevant target pages and underlying independence assumption.</p>
          </caption>
          <graphic xlink:href="jmir_v22i7e17853_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Recall estimate equation.</p>
          </caption>
          <graphic xlink:href="jmir_v22i7e17853_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Text Classification</title>
        <p>Support vector machines (SVMs) originate from the field of machine learning and are known to perform well for text classification tasks [<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref45">45</xref>]. For this reason, we relied on an SVM to determine a web page’s relevance within the health domain. <xref rid="figure5" ref-type="fig">Figure 5</xref> depicts the system’s workflow for the training and classification phase.</p>
        <p>To build our SVM-based text classifier, we followed related methods as described by Joachims [<xref ref-type="bibr" rid="ref44">44</xref>]: as a first step toward a health text classifier, automatically gathered health-related articles (contained in a document collection [D]) were cleaned from syntactic markup (eg, boilerplate code, HTML tags). Each article was then tokenized (ie, split into single word fragments) and each character was converted to lower case (also known as case folding). Stop words (eg, the, and, it) were removed as these kinds of tokens do not carry any relevant information. Next, stemming techniques were applied in order to map tokens to their stem forms and reduce morphological variations of words (eg, goes becomes go). Each article was transformed into a document vector containing all distinct terms. To do so, it is necessary to compute the terms that are representative for every article. A so-called feature selection produces a smaller subset of features (F) which yields the most relevant features for each article, limited by a predetermined threshold [<xref ref-type="bibr" rid="ref54">54</xref>]. Given D and F, an SVM was trained to distinguish between vectors of health-related (H) and non–health-related (G) articles. The resulting classifier may be applied to previously unclassified web pages in order to predict their health-relatedness. To evaluate the classifier’s quality, we used well-established metrics from the field of information retrieval such as accuracy, recall, and precision [<xref ref-type="bibr" rid="ref29">29</xref>].</p>
        <p>LIBSVM [<xref ref-type="bibr" rid="ref55">55</xref>] and its object-oriented binding zlibsvm [<xref ref-type="bibr" rid="ref56">56</xref>] were used as an SVM implementation of the text classifier. For building and training the SVM, the process described by Joachims [<xref ref-type="bibr" rid="ref44">44</xref>] was applied. To reduce dimensionality, the feature selection method <italic>information gain</italic> was used [<xref ref-type="bibr" rid="ref54">54</xref>]. Word embedding was conducted using <italic>tfc</italic> [<xref ref-type="bibr" rid="ref57">57</xref>] as a term-weighting approach.</p>
        <p>To find an optimal hyperparameter combination for the chosen radial basis function kernel, a grid search using 10-fold cross-validation, as recommended by the LIBSVM authors [<xref ref-type="bibr" rid="ref55">55</xref>], was conducted. According to the Pareto Principle, training and test data were constructed using an 80:20 split [<xref ref-type="bibr" rid="ref58">58</xref>]. In addition, the classes inside these data sets were equally balanced according to Wei and Dunbrack [<xref ref-type="bibr" rid="ref59">59</xref>] as the real-world class distribution of H and G is unknown.</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Workflow of a support vector machine–based text classification system: black lines indicate the training process; blue lines indicate the classification process; slanted boxes represent data; rectangular boxes represent computational steps.</p>
          </caption>
          <graphic xlink:href="jmir_v22i7e17853_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Graph Metrics</title>
        <p>The graph structure of the web has been extensively analyzed in several studies [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref60">60</xref>,<xref ref-type="bibr" rid="ref61">61</xref>]. In this context, a graph node represents a web page and an edge represents a link between two web pages. We generated a so-called host-aggregated graph from the original web graph in order to reduce its computational complexity and explore its properties [<xref ref-type="bibr" rid="ref61">61</xref>]. In this process, single web pages are combined and represented by their parent web site (including outgoing and ingoing links). On the resulting host-aggregated graph, we applied the following metrics:</p>
        <list list-type="bullet">
          <list-item>
            <p>Average degree is the average number of edges connected to a node [<xref ref-type="bibr" rid="ref62">62</xref>]. For a (directed) web graph, this is defined as the total number of edges divided by the total number of nodes. The average in-degree denotes the average number of ingoing edges to a node (ie, links to a web site). The average out-degree is defined as the average number of outgoing edges of a node (ie, links targeting other web sites).</p>
          </list-item>
          <list-item>
            <p>Modularity measures the strength of division of a graph into clusters or groups [<xref ref-type="bibr" rid="ref62">62</xref>,<xref ref-type="bibr" rid="ref63">63</xref>]. Graphs with a high modularity have dense connections between the web sites within certain clusters but sparse connections to other web sites, which are contained in different clusters.</p>
          </list-item>
          <list-item>
            <p>PageRank is a centrality-based metric that allows identification of web sites (nodes) of importance inside a graph [<xref ref-type="bibr" rid="ref64">64</xref>]. The underlying assumption is that an important or prestigious web site will receive more links from other important web sites (ie, higher in-degree).</p>
          </list-item>
        </list>
        <p>Other metrics such as network diameter and the average path length (ie, the average number of clicks which will lead from one web site to another) are frequently used for graph analysis [<xref ref-type="bibr" rid="ref62">62</xref>,<xref ref-type="bibr" rid="ref65">65</xref>].</p>
      </sec>
      <sec>
        <title>Data Acquisition</title>
        <sec>
          <title>Seed Generation</title>
          <p>The selection of seed sources is crucial for the performance of a focused crawler [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref66">66</xref>-<xref ref-type="bibr" rid="ref68">68</xref>]. For certain top-level domains (TLDs; eg, .com), the domain name system zone files, which contain all registered domains for the given TLD, are available to the public free of charge. These zone files can then be used to extract seeds. However, due to data protection regulations, accessing and using the domain name system zone files for the ccTLDs .de, .at, and .ch was not possible.</p>
          <p>Other studies leverage search engines with specific queries [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref66">66</xref>,<xref ref-type="bibr" rid="ref69">69</xref>] to obtain high-quality seeds. However, most search engines restrict the number of queries and limit the number of returned results. Also, the results might be influenced by commercial interests, and crafting high-quality search queries demands time and/or financial resources.</p>
          <p>Another widely used seed source is the web taxonomy Curlie [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref66">66</xref>,<xref ref-type="bibr" rid="ref70">70</xref>,<xref ref-type="bibr" rid="ref71">71</xref>], which provides human-maintained precategorized web sites. Seeds can be harvested as dumps and are available free of charge. In addition, it is possible to reuse the results of a previous crawl to generate seeds. For this study, we relied on Curlie and the data of a previous health-related crawl conducted in 2016 [<xref ref-type="bibr" rid="ref72">72</xref>].</p>
        </sec>
        <sec>
          <title>Machine Learning Data Sets</title>
          <sec>
            <title>Training and Test Corpus</title>
            <p>To obtain a large enough data set for training and testing the SVM text classifier used within the focused crawler, web pages from various German content providers were obtained. First, the web pages were downloaded by specialized web crawlers implemented in Java using the crawler4j framework [<xref ref-type="bibr" rid="ref73">73</xref>]. Next, boilerplate detection and data cleaning were conducted using regular expression filters. After this step, the cleaned textual content was stored in a relational database for further processing. Regarding each content provider, a random sample was manually inspected by the authors in order to assess data quality.</p>
            <p>Each content provider and all related articles were put into one of the two classes: health-related language (H) or general language (G). The coding was based on (1) the organizations providing the content, (2) health-related content certification (eg, Health On the Net Foundation Code of Conduct), and (3) a manual inspection by the authors, in which the topic relevance of a random sample for each content provider was assessed.</p>
          </sec>
          <sec>
            <title>Crowd-Validated Test Corpus</title>
            <p>As the training and test corpus were generated by using a priori knowledge of each content provider, the authors decided to construct an additional independent human-validated data set to evaluate the classifier’s performance.</p>
            <p>Recent studies have shown that crowdsourcing can produce comparable results to human experts at a faster pace [<xref ref-type="bibr" rid="ref74">74</xref>-<xref ref-type="bibr" rid="ref78">78</xref>]. Thus, crowdsourcing was used to assess the evaluation data set. <xref rid="figure6" ref-type="fig">Figure 6</xref> depicts the process of building this validated data set.</p>
            <p>First, web pages were manually selected from a crawl conducted in 2016 [<xref ref-type="bibr" rid="ref72">72</xref>]. It was ensured that the selected pages were neither included in the training set nor in the test corpus generated in the previous step. Next, each web page was assessed by a group of workers and categorized as H or G. Raters were given clear instructions on how to categorize given web pages (see <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). In addition, each rater successfully completed a quiz-based training before they could participate in the study [<xref ref-type="bibr" rid="ref79">79</xref>-<xref ref-type="bibr" rid="ref83">83</xref>]. Precategorized web pages (test questions) were mixed into the rating process to keep the attention of the raters at a high level.</p>
            <p>If a rater failed to answer a specific number of such test questions, the assessments of this rater were considered as dropouts. Following the recommendation by Carvalho et al [<xref ref-type="bibr" rid="ref84">84</xref>], each web page was assessed by at least 10 crowd-workers on the commercial crowd-working platform FigureEight [<xref ref-type="bibr" rid="ref85">85</xref>]. In addition, the same web pages were coded by final year medical students (at least two students per web page) at the University of Heidelberg in the context of the lecture Medical Informatics. Study participation was voluntary.</p>
            <p>If there was no clear majority vote for a certain class between the crowd-workers, the assessments of the medical students were taken into consideration. If there was still no agreement, the web page was listed as a dropout.</p>
            <p>The statistical software R version 3.4.4 (R Foundation for Statistical Computing) on an Ubuntu 18.04 LTS 64-bit computer was used to compute percent agreement [<xref ref-type="bibr" rid="ref86">86</xref>] and Fleiss κ [<xref ref-type="bibr" rid="ref87">87</xref>].</p>
            <fig id="figure6" position="float">
              <label>Figure 6</label>
              <caption>
                <p>Workflow of the crowd-sourcing approach to build a test corpus for the purpose of classifier evaluation. Black lines indicate the assessment process; slanted boxes represent data; rectangular boxes represent processing steps.</p>
              </caption>
              <graphic xlink:href="jmir_v22i7e17853_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </fig>
          </sec>
        </sec>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Seeds</title>
        <p>Seeds were obtained from the health category (German: Gesundheit) of Curlie and a health-related crawl conducted in 2016 targeting health-related web sites in German [<xref ref-type="bibr" rid="ref72">72</xref>]. In total, n=22,405 seeds with ccTLDs .de (19,126/22,405, 85.36%), .at (1530/22,405, 6.83%), and .ch (1749/22,405, 7.81%) were collected and used in this study. The full list of seeds can be found in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p>
      </sec>
      <sec>
        <title>Machine Learning Data Sets</title>
        <sec>
          <title>Data Set Characteristics</title>
          <p>Web pages from various German content providers were collected between April 24, 2018, and August 16, 2018. A detailed list and description of each content provider is shown in <xref ref-type="table" rid="table1">Table 1</xref>. In total, 98,442 articles were collected. The average word count for each document was 741; the average sentence count was 44.</p>
          <p>For category H, we collected 9638 articles from the categories “medicine” and “medical report” from Deutsches Ärzteblatt (a magazine tailored to physicians) and 1907 from Apotheken Umschau (a magazine freely available in German pharmacies, tailored to lay citizens). In addition, we acquired 235 and 636 articles from the medical content providers Institute for Quality and Efficiency in Healthcare and Onmeda, respectively. Moreover, 2929 documents were obtained from the national health portal of the Republic of Austria. In addition, 28,436 health-related articles from Wikipedia Health were gathered by using the Wikipedia category graph. For category G, 18,364 random articles from Wikipedia General were collected, which were not related to the category Health (German: Gesundheit). In addition, 36,297 German web pages were selected randomly from the Common Crawl Foundation.</p>
          <table-wrap position="float" id="table1">
            <label>Table 1</label>
            <caption>
              <p>Total number of acquired articles and respective class labels of various German content providers.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="200"/>
              <col width="60"/>
              <col width="50"/>
              <col width="240"/>
              <col width="80"/>
              <col width="110"/>
              <col width="130"/>
              <col width="130"/>
              <thead>
                <tr valign="top">
                  <td>Content provider</td>
                  <td>Class</td>
                  <td>Cert<sup>a</sup></td>
                  <td>Organization</td>
                  <td>Articles</td>
                  <td>Words (mean)</td>
                  <td>Words (median)</td>
                  <td>Sentences (mean)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Wikipedia Health</td>
                  <td>H<sup>b</sup></td>
                  <td>no</td>
                  <td>Wikimedia Foundation</td>
                  <td>28,436</td>
                  <td>429</td>
                  <td>254</td>
                  <td>31</td>
                </tr>
                <tr valign="top">
                  <td>Wikipedia General</td>
                  <td>G<sup>c</sup></td>
                  <td>no</td>
                  <td>Wikimedia Foundation</td>
                  <td>18,364</td>
                  <td>736</td>
                  <td>266</td>
                  <td>26</td>
                </tr>
                <tr valign="top">
                  <td>Common Crawl</td>
                  <td>G</td>
                  <td>no</td>
                  <td>Common Crawl Foundation</td>
                  <td>36,297</td>
                  <td>480</td>
                  <td>429</td>
                  <td>33</td>
                </tr>
                <tr valign="top">
                  <td>Deutsches Ärzteblatt</td>
                  <td>H</td>
                  <td>no</td>
                  <td>German Medical Association, National Association of Statutory Health Insurance Physicians</td>
                  <td>9638</td>
                  <td>1852</td>
                  <td>520</td>
                  <td>136</td>
                </tr>
                <tr valign="top">
                  <td>Onmeda</td>
                  <td>H</td>
                  <td>yes</td>
                  <td>Gofeminin.de GmbH</td>
                  <td>636</td>
                  <td>6564</td>
                  <td>6113</td>
                  <td>439</td>
                </tr>
                <tr valign="top">
                  <td>gesundheitsinformation.de</td>
                  <td>H</td>
                  <td>yes</td>
                  <td>Institute for Quality and Efficiency in Healthcare</td>
                  <td>235</td>
                  <td>1923</td>
                  <td>1799</td>
                  <td>139</td>
                </tr>
                <tr valign="top">
                  <td>Apotheken Umschau</td>
                  <td>H</td>
                  <td>yes</td>
                  <td>Wort &#38; Bild Verlag</td>
                  <td>1907</td>
                  <td>1052</td>
                  <td>658</td>
                  <td>73</td>
                </tr>
                <tr valign="top">
                  <td>GESUNDheit.gv.at</td>
                  <td>H</td>
                  <td>no</td>
                  <td>Ministry of Social Affairs (Austria)</td>
                  <td>2929</td>
                  <td>295</td>
                  <td>221</td>
                  <td>21</td>
                </tr>
                <tr valign="top">
                  <td>Total</td>
                  <td>—<sup>d</sup></td>
                  <td>—</td>
                  <td>—</td>
                  <td>98,442</td>
                  <td>741</td>
                  <td>339</td>
                  <td>44</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table1fn1">
                <p><sup>a</sup>Yes indicates that a provider is certified by the Health On The Net Foundation Code of Conduct or another certification provider.</p>
              </fn>
              <fn id="table1fn2">
                <p><sup>b</sup>H: health-related language.</p>
              </fn>
              <fn id="table1fn3">
                <p><sup>c</sup>G: general language.</p>
              </fn>
              <fn id="table1fn4">
                <p><sup>d</sup>Not applicable.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
        <sec>
          <title>Training and Test Corpus</title>
          <p>For training and evaluation of the SVM classifier, 87,562 articles were used. <xref ref-type="table" rid="table2">Table 2</xref> lists the final data sets. In total, 80.00% (70,048/87,562) of articles were used for training the classifier and 20.00% (17,514/87,562) were used for testing.</p>
          <table-wrap position="float" id="table2">
            <label>Table 2</label>
            <caption>
              <p>Total amount of articles used in the training and test corpus per content provider with corresponding class labels: health-related language (H) and general language (G).</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="480"/>
              <col width="110"/>
              <col width="130"/>
              <col width="130"/>
              <col width="150"/>
              <thead>
                <tr valign="top">
                  <td>Content provider</td>
                  <td>Class</td>
                  <td colspan="3">Documents</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>
                    <break/>
                  </td>
                  <td>Training</td>
                  <td>Test</td>
                  <td>Total</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Wikipedia</td>
                  <td>H<sup>a</sup></td>
                  <td>22,748</td>
                  <td>5688</td>
                  <td>28,436</td>
                </tr>
                <tr valign="top">
                  <td>Wikipedia</td>
                  <td>G<sup>b</sup></td>
                  <td>10,339</td>
                  <td>2585</td>
                  <td>12,924</td>
                </tr>
                <tr valign="top">
                  <td>Common Crawl</td>
                  <td>G</td>
                  <td>24,685</td>
                  <td>6172</td>
                  <td>30,857</td>
                </tr>
                <tr valign="top">
                  <td>Deutsches Ärzteblatt</td>
                  <td>H</td>
                  <td>7710</td>
                  <td>1928</td>
                  <td>9638</td>
                </tr>
                <tr valign="top">
                  <td>Onmeda</td>
                  <td>H</td>
                  <td>509</td>
                  <td>127</td>
                  <td>636</td>
                </tr>
                <tr valign="top">
                  <td>gesundheitsinformation.de</td>
                  <td>H</td>
                  <td>189</td>
                  <td>46</td>
                  <td>235</td>
                </tr>
                <tr valign="top">
                  <td>Apotheken Umschau</td>
                  <td>H</td>
                  <td>1525</td>
                  <td>382</td>
                  <td>1907</td>
                </tr>
                <tr valign="top">
                  <td>GESUNDheit.gv.at</td>
                  <td>H</td>
                  <td>2343</td>
                  <td>586</td>
                  <td>2929</td>
                </tr>
                <tr valign="top">
                  <td>Total</td>
                  <td>—<sup>c</sup></td>
                  <td>70,048</td>
                  <td>17,514</td>
                  <td>87,562</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table2fn1">
                <p><sup>a</sup>H: health-related language.</p>
              </fn>
              <fn id="table2fn2">
                <p><sup>b</sup>G: general language.</p>
              </fn>
              <fn id="table2fn3">
                <p><sup>c</sup>Not applicable.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
        <sec>
          <title>Crowd-Validated Test Corpus</title>
          <p>A total of 432 web pages (216 per class) were manually selected from a health-related crawl conducted in 2016 [<xref ref-type="bibr" rid="ref72">72</xref>]. The selected web pages were neither contained in the training nor in the test corpus (see <xref ref-type="table" rid="table2">Table 2</xref>).</p>
          <p>Each web page was assessed by 10 crowd-workers between February 2, 2019, and February 16, 2019, on the commercial crowd-working platform, FigureEight [<xref ref-type="bibr" rid="ref85">85</xref>]. In total, 4367 assessments by 28 crowd-workers were collected at a cost of US $36.06. The overall satisfaction (as measured by FigureEight) was 4.45 out of 5 possible points (instructions clear: 4.5/5; test questions fair: 4.55/5; ease of job: 4.5/5; payment: 3.65/5); 14 out of 28 (50%) workers participated in this voluntary exit survey. Percent agreement was 0.855; Fleiss κ was 0.279.</p>
          <p>In addition, the same web pages were coded by medical students (n=40). Study participation was voluntary. Each web page was assessed by at least two students. Percent agreement was 0.719; Fleiss κ was 0.337. According to Landis and Koch [<xref ref-type="bibr" rid="ref88">88</xref>], these κ values correspond to a fair agreement.</p>
          <p>The resulting data set contained n=384 web pages (192 per class). This corresponds to a dropout rate of 11.1% (48/432). The full list of coded web pages is given in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p>
        </sec>
      </sec>
      <sec>
        <title>Classifier Performance</title>
        <p>The classifier was evaluated against the test and crowd-validated data set, and the results are presented in <xref ref-type="table" rid="table3">Table 3</xref>. The classifier achieved a precision of 0.934, a recall of 0.940, and an accuracy of 0.937 on its test data set; 5.96% (522/8757) of health-related web pages were falsely classified as nonrelevant by the SVM. On the other hand, 6.57% (575/8757) of the nonrelevant pages were classified as health-related.</p>
        <p>On the crowd-validated real-world data set, the classifier achieved an accuracy of 0.966, a precision of 0.954, and a recall of 0.989. Only 1.0% (2/192) of the health-related web pages were falsely classified as nonrelevant, and 5.7% (11/192) of nonrelevant web pages were classified as health-related.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Listing of the confusion matrix and related evaluation metrics for the test and crowd-validated data set.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="350"/>
            <col width="90"/>
            <col width="100"/>
            <col width="100"/>
            <col width="130"/>
            <col width="100"/>
            <col width="100"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Evaluation data sets</td>
                <td colspan="6">Baseline</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td>Health</td>
                <td>General</td>
                <td>Sum</td>
                <td>Accuracy</td>
                <td>Precision</td>
                <td>Recall</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="2">
                  <bold>Test data set</bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>0.937</td>
                <td>0.934</td>
                <td>0.940</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>SVM<sup>a</sup></td>
                <td>—<sup>b</sup></td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Health</td>
                <td>8182</td>
                <td>575</td>
                <td>8757</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>General</td>
                <td>522</td>
                <td>8235</td>
                <td>8757</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Sum</td>
                <td>8704</td>
                <td>8810</td>
                <td>17,514</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>Crowd-validated data set</bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>0.966</td>
                <td>0.954</td>
                <td>0.989</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>SVM</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Health</td>
                <td>181</td>
                <td>11</td>
                <td>192</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>General</td>
                <td>2</td>
                <td>190</td>
                <td>192</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Sum</td>
                <td>183</td>
                <td>201</td>
                <td>384</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>SVM: support vector machine.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>Not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Crawler Performance</title>
        <p>Our system achieved a download rate of 7 to 10 documents per second. This sums up to 227 days of pure crawling and classification of approximately 133 million web pages.</p>
        <p>The crawl yielded approximately 13.5 million presumably relevant web pages and approximately 119.5 million nonrelevant web pages. <xref rid="figure7" ref-type="fig">Figure 7</xref> depicts the harvest rate during the crawl.</p>
        <p>The overall mean harvest rate was 19.76% (HR<sub><italic>t</italic> =222</sub>=HR<sub>max</sub>=36.45%; HR<sub><italic>t</italic> =53</sub>=HR<sub>min</sub>=0.00%). HR<sub>max</sub> was achieved at day 222 as the crawl was resumed after infrastructure maintenance due to urgent security updates; HR<sub>min</sub> was recorded on day 53. It was caused by a data center outage in which the infrastructure had to be shut down.</p>
        <p>As an additional measure, we estimated the recall of our focused crawling by using the seed-target approach [<xref ref-type="bibr" rid="ref29">29</xref>]. For this purpose, the initial seed set (n=22,405) was divided into a set of seeds (n=17,405) and targets (n=5000); the ccTLD distribution was maintained in each subset, and 4105 out of 5000 targets (82.10%) were contained in the crawl. This corresponds to an estimated recall of 0.821.</p>
        <fig id="figure7" position="float">
          <label>Figure 7</label>
          <caption>
            <p>Harvest rate over time measured at the end of each day (dashed line represents the mean harvest rate). Note that the drop at day 53 is related to an outage at our data center. Peak at day 106: storm cluster was extended by three additional virtual machines. Peaks at days 157, 158, 191, 194 and 222: crawl was resumed after infrastructure maintenance due to urgent security updates that required a restart of the host system and/or of the virtual machines.</p>
          </caption>
          <graphic xlink:href="jmir_v22i7e17853_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Graph Structure</title>
        <p>The graph database Neo4j (v3.5.4) and its graph algorithm plugins [<xref ref-type="bibr" rid="ref89">89</xref>] were used to compute the metrics as described in Graph Metrics on an Ubuntu 18.04 LTS 64-bit server. In order to reduce graph complexity, all web pages belonging to the same web site were aggregated and substituted by their parent web site (including outgoing and ingoing links; see Graph Metrics). The resulting graph contains 215,372 nodes (web sites) and 403,175 edges (links between web sites). A total of 82.56% (177,816/215,372) of the web sites belong to the ccTLD .de; 7.95% (17,126/215,372) to .at, and 9.49% (20,430/215,372) to .ch.</p>
        <p>The graph has a network diameter of 25. The average path length is 6.466. The average degree is 1.872, the average in-degree is 1.892, and the average out-degree is 1.845. Modularity was computed to be 0.723.</p>
        <p>During the analysis, several types of website publishers emerged: public institutions, nonprofit organizations, and private organizations or single individuals. As the ccTLD .de has the highest share within the graph, a global ranking according to PageRank would be dominated by .de web sites. For this reason, the following paragraph will present the top 25 web sites according to PageRank for each ccTLD separately.</p>
        <p><xref ref-type="table" rid="table4">Table 4</xref> lists the 25 top-ranked web sites according to PageRank with their respective publisher for .de; 12 out of 25 (48%) are published by public institutions, 32% (8/25) are published by nonprofit organizations, and 20% (5/25) by private organizations. The top-ranked 25 web sites for .at are shown in <xref ref-type="table" rid="table5">Table 5</xref>; 12 out of 25 (48%) are published by public institutions, 4% (1/25) are published by nonprofit organizations, and 48% (12/25) by private organizations (see <xref ref-type="table" rid="table5">Table 5</xref>). For the ccTLD .ch, 24% (6/25) are published by public institutions, 40% (10/25) originate from nonprofit organizations, and 9/25 (36%) are published by private organizations (see <xref ref-type="table" rid="table6">Table 6</xref>).</p>
        <p>Overall, 40% (30/75) are web sites published by public institutions, 25% (19/75) are published by nonprofit organizations, and 35% (26/75) by private organizations.</p>
        <p>The graph visualization tool Gephi v0.9.2 [<xref ref-type="bibr" rid="ref90">90</xref>] was used on a bare-metal Windows 10 64-bit computer to explore the host-aggregated graph structure. Unfortunately, we experienced serious performance issues while running Gephi’s visualization algorithms. This is the main reason why we illustrate just a small example extract of the host-aggregated graph: <xref rid="figure8" ref-type="fig">Figure 8</xref> consists of 94 nodes and 243 edges and presents basic aspects of the graph’s structure. The focus is on www.rki.de as the top-ranked web site for the ccTLD .de (according to the PageRank analysis presented below). The surrounding nodes represent health-related web sites in close proximity to www.rki.de.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Domains of 25 top-ranked web sites for country-code top-level domain .de with their respective publisher according to PageRank.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="60"/>
            <col width="320"/>
            <col width="560"/>
            <col width="60"/>
            <thead>
              <tr valign="top">
                <td>Rank</td>
                <td>Domain</td>
                <td>Publisher</td>
                <td>Type</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1</td>
                <td>www.rki.de</td>
                <td>Robert Koch Institute</td>
                <td>PI<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>www.aerzteblatt.de</td>
                <td>Deutscher Ärzte-Verlag GmbH</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>3</td>
                <td>www.charite.de</td>
                <td>Charité–Berlin University of Medicine</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>4</td>
                <td>www.deutsche-alzheimer.de</td>
                <td>Deutsche Alzheimer Gesellschaft</td>
                <td>NPO<sup>b</sup></td>
              </tr>
              <tr valign="top">
                <td>5</td>
                <td>www.aerztezeitung.de</td>
                <td>Springer Medizin Verlag GmbH</td>
                <td>PO<sup>c</sup></td>
              </tr>
              <tr valign="top">
                <td>6</td>
                <td>www.dge.de</td>
                <td>Deutsche Gesellschaft für Ernährung</td>
                <td>NPO</td>
              </tr>
              <tr valign="top">
                <td>7</td>
                <td>www.g-ba.de</td>
                <td>Gemeinsamer Bundesausschuss (Federal Joint Committee)</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>8</td>
                <td>www.bzga.de</td>
                <td>Bundeszentrale für gesundheitliche Aufklärung (Federal Centre for Health Education)</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>9</td>
                <td>www.bundesgesundheitsministerium.de</td>
                <td>Bundesministerium für Gesundheit (Federal Ministry of Health)</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>10</td>
                <td>www.apotheken-umschau.de</td>
                <td>Wort &#38; Bild Verlag</td>
                <td>PO</td>
              </tr>
              <tr valign="top">
                <td>11</td>
                <td>www.dimdi.de</td>
                <td>Deutsches Institut für Medizinische Dokumentation und Information (German Institute for Medical Documentation and Information)</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>12</td>
                <td>www.gesundheitsinformation.de</td>
                <td>Institut für Qualität und Wirtschaftlichkeit im Gesundheitswesen (Institute for Quality and Efficiency in Healthcare)</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>13</td>
                <td>www.osteopathie.de</td>
                <td>Verband der Osteopathen Deutschland eV</td>
                <td>NPO</td>
              </tr>
              <tr valign="top">
                <td>14</td>
                <td>www.krebsgesellschaft.de</td>
                <td>Deutsche Krebsgesellschaft eV</td>
                <td>NPO</td>
              </tr>
              <tr valign="top">
                <td>15</td>
                <td>www.bfarm.de</td>
                <td>Bundesinstitut für Arzneimittel und Medizinprodukte (Federal Institute for Drugs and Medical Devices)</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>16</td>
                <td>www.kbv.de</td>
                <td>Kassenärztliche Bundesvereinigung</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>17</td>
                <td>www.krebshilfe.de</td>
                <td>Stiftung Deutsche Krebshilfe</td>
                <td>NPO</td>
              </tr>
              <tr valign="top">
                <td>18</td>
                <td>www.tk.de</td>
                <td>Techniker Krankenkasse (Health Insurance)</td>
                <td>PO</td>
              </tr>
              <tr valign="top">
                <td>19</td>
                <td>www.ebm-netzwerk.de</td>
                <td>Deutsches Netzwerk Evidenzbasierte Medizin eV</td>
                <td>NPO</td>
              </tr>
              <tr valign="top">
                <td>20</td>
                <td>www.bmg.bund.de</td>
                <td>Bundesministerium für Gesundheit (Federal Ministry of Health)</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>21</td>
                <td>www.netdoktor.de</td>
                <td>NetDoktor.de GmbH</td>
                <td>PO</td>
              </tr>
              <tr valign="top">
                <td>22</td>
                <td>www.drk.de</td>
                <td>Deutsches Rotes Kreuz eV (German Red Cross)</td>
                <td>NPO</td>
              </tr>
              <tr valign="top">
                <td>23</td>
                <td>www.herzstiftung.de</td>
                <td>Deutsche Herzstiftung</td>
                <td>NPO</td>
              </tr>
              <tr valign="top">
                <td>24</td>
                <td>www.klinikum.uni-heidelberg.de</td>
                <td>Universitätsklinikum Heidelberg</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>25</td>
                <td>www.aok.de</td>
                <td>AOK Gesundheitskasse (Health Insurance)</td>
                <td>PO</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>PI: public institution.</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>NPO: nonprofit organization.</p>
            </fn>
            <fn id="table4fn3">
              <p><sup>c</sup>PO: private organization.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Domains of 25 top-ranked web sites for country-code top-level domain .at with their respective publisher according to PageRank.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="60"/>
            <col width="280"/>
            <col width="600"/>
            <col width="60"/>
            <thead>
              <tr valign="top">
                <td>Rank</td>
                <td>Domain</td>
                <td>Publisher</td>
                <td>Type</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1</td>
                <td>www.gesundheit.gv.at</td>
                <td>Bundesministerium für Arbeit, Soziales, Gesundheit und Konsumentenschutz (Ministry of Social Affairs)</td>
                <td>PI<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>www.meduniwien.ac.at</td>
                <td>Medical University of Vienna</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>3</td>
                <td>www.bmgf.gv.at</td>
                <td>Bundesministerium für Arbeit, Soziales, Gesundheit und Konsumentenschutz (Ministry of Social Affairs)</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>4</td>
                <td>www.sozialministerium.at</td>
                <td>Bundesministerium für Arbeit, Soziales, Gesundheit und Konsumentenschutz (Ministry of Social Affairs)</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>5</td>
                <td>www.apotheker.or.at</td>
                <td>Österreichische Apothekenkammer (Austrian Pharmaceutical Association)</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>6</td>
                <td>www.sam-pharma.at</td>
                <td>Pharma Handel GmbH</td>
                <td>PO<sup>b</sup></td>
              </tr>
              <tr valign="top">
                <td>7</td>
                <td>www.aerztekammer.at</td>
                <td>Österreichische Ärztekammer (Austrian Medical Association)</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>8</td>
                <td>www.univie.ac.at</td>
                <td>University of Vienna</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>9</td>
                <td>www.herz-ambulatorium.at</td>
                <td>Individual Person</td>
                <td>PO</td>
              </tr>
              <tr valign="top">
                <td>10</td>
                <td>www.herz-ordination.at</td>
                <td>Individual Person</td>
                <td>PO</td>
              </tr>
              <tr valign="top">
                <td>11</td>
                <td>www.tg-steiermark.at</td>
                <td>TG Therapeutische Gemeinschaft Betriebs GmbH</td>
                <td>NPO<sup>c</sup></td>
              </tr>
              <tr valign="top">
                <td>12</td>
                <td>www.impuls-fs.at</td>
                <td>Institut für medizinisch-physiotherapeutische Untersuchung, Lehre und Schulung</td>
                <td>PO</td>
              </tr>
              <tr valign="top">
                <td>13</td>
                <td>www.medunigraz.at</td>
                <td>Medical University of Graz</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>14</td>
                <td>www.brustvergroesserung-leicht.at</td>
                <td>Individual Person</td>
                <td>PO</td>
              </tr>
              <tr valign="top">
                <td>15</td>
                <td>www.bmg.gv.at</td>
                <td>Bundesministerium für Arbeit, Soziales, Gesundheit und Konsumentenschutz (Ministry of Social Affairs)</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>16</td>
                <td>www.kages.at</td>
                <td>Steiermärkische Krankenanstaltengesellschaft mbH</td>
                <td>PO</td>
              </tr>
              <tr valign="top">
                <td>17</td>
                <td>science.orf.at</td>
                <td>Österreichischer Rundfunk (Austrian Broadcasting Corporation)</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>18</td>
                <td>www.gynmed.at</td>
                <td>Individual Person</td>
                <td>PO</td>
              </tr>
              <tr valign="top">
                <td>19</td>
                <td>www.fhstp.ac.at</td>
                <td>St. Pölten University of Applied Sciences</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>20</td>
                <td>www.dr-boehm.at</td>
                <td>Individual Person</td>
                <td>PO</td>
              </tr>
              <tr valign="top">
                <td>21</td>
                <td>bmg.gv.at</td>
                <td>Bundesministerium für Arbeit, Soziales, Gesundheit und Konsumentenschutz (Ministry of Social Affairs)</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>22</td>
                <td>www.novartis.at</td>
                <td>Novartis AG</td>
                <td>PO</td>
              </tr>
              <tr valign="top">
                <td>23</td>
                <td>www.babyforum.at</td>
                <td>FOKUS KIND Medien, CRAFT &#38; VALUE</td>
                <td>PO</td>
              </tr>
              <tr valign="top">
                <td>24</td>
                <td>femmestyle.at</td>
                <td>Schönheitschirurgie femmestyle</td>
                <td>PO</td>
              </tr>
              <tr valign="top">
                <td>25</td>
                <td>www.pfizer.at</td>
                <td>Pfizer Inc</td>
                <td>PO</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table5fn1">
              <p><sup>a</sup>PI: public institution.</p>
            </fn>
            <fn id="table5fn2">
              <p><sup>b</sup>PO: private organization.</p>
            </fn>
            <fn id="table5fn3">
              <p><sup>c</sup>NPO: nonprofit organization.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table6">
          <label>Table 6</label>
          <caption>
            <p>Domains of 25 top-ranked web sites for country-code top-level domain .ch with their respective publisher according to PageRank.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="60"/>
            <col width="300"/>
            <col width="580"/>
            <col width="60"/>
            <thead>
              <tr valign="top">
                <td>Rank</td>
                <td>Domain</td>
                <td>Publisher</td>
                <td>Type</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1</td>
                <td>www.uzh.ch</td>
                <td>University of Zurich</td>
                <td>PI<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>www.usz.ch</td>
                <td>Universitätsspital Zürich</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>3</td>
                <td>www.srf.ch</td>
                <td>Schweizerische Radio- und Fernsehgesellschaft (Swiss Broadcasting Corporation)</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>4</td>
                <td>www.netdoktor.ch</td>
                <td>netdoktor GmbH</td>
                <td>PO<sup>b</sup></td>
              </tr>
              <tr valign="top">
                <td>5</td>
                <td>www.pancreas-help.ch</td>
                <td>Schweizer Selbsthilfeorganisation Pankreaserkrankungen</td>
                <td>NPO<sup>c</sup></td>
              </tr>
              <tr valign="top">
                <td>6</td>
                <td>www.mutterglueck.ch</td>
                <td>Individual Person</td>
                <td>PO</td>
              </tr>
              <tr valign="top">
                <td>7</td>
                <td>www.association-osteo-swiss.ch</td>
                <td>Schweizerischer Verband der Osteopathen</td>
                <td>NPO</td>
              </tr>
              <tr valign="top">
                <td>8</td>
                <td>www.unibas.ch</td>
                <td>University of Basel</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>9</td>
                <td>www.ethz.ch</td>
                <td>ETH Zurich (Swiss Federal Institute of Technology in Zurich)</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>10</td>
                <td>www.rheumaliga.ch</td>
                <td>Rheumaliga Schweiz</td>
                <td>NPO</td>
              </tr>
              <tr valign="top">
                <td>11</td>
                <td>www.lungenliga.ch</td>
                <td>Lungenliga Schweiz</td>
                <td>NPO</td>
              </tr>
              <tr valign="top">
                <td>12</td>
                <td>www.rotpunkt-apotheken.ch</td>
                <td>Rotpunkt-Pharma AG</td>
                <td>PO</td>
              </tr>
              <tr valign="top">
                <td>13</td>
                <td>www.pharmawiki.ch</td>
                <td>PharmaWiki GmbH</td>
                <td>PO</td>
              </tr>
              <tr valign="top">
                <td>14</td>
                <td>www.bayer.ch</td>
                <td>Bayer AG</td>
                <td>PO</td>
              </tr>
              <tr valign="top">
                <td>15</td>
                <td>www.patientensicherheit.ch</td>
                <td>Stiftung Patientensicherheit Schweiz</td>
                <td>NPO</td>
              </tr>
              <tr valign="top">
                <td>16</td>
                <td>saez.ch</td>
                <td>EMH Schweizerischer Ärzteverlag AG</td>
                <td>NPO</td>
              </tr>
              <tr valign="top">
                <td>17</td>
                <td>www.swissheart.ch</td>
                <td>Schweizerische Herzstiftung</td>
                <td>NPO</td>
              </tr>
              <tr valign="top">
                <td>18</td>
                <td>gesundheitsfoerderung.ch</td>
                <td>Gesundheitsförderung Schweiz</td>
                <td>NPO</td>
              </tr>
              <tr valign="top">
                <td>19</td>
                <td>sensomotorische-lebensweisen.ch</td>
                <td>Individual Person</td>
                <td>PO</td>
              </tr>
              <tr valign="top">
                <td>20</td>
                <td>www.spitaluster.ch</td>
                <td>Spital Uster</td>
                <td>PO</td>
              </tr>
              <tr valign="top">
                <td>21</td>
                <td>symptome.ch</td>
                <td>NOXA GmbH</td>
                <td>PO</td>
              </tr>
              <tr valign="top">
                <td>22</td>
                <td>www.meineimpfungen.ch</td>
                <td>Stiftung meineimpfungen</td>
                <td>NPO</td>
              </tr>
              <tr valign="top">
                <td>23</td>
                <td>unicef.ch</td>
                <td>United Nations International Children's Emergency Fund</td>
                <td>NPO</td>
              </tr>
              <tr valign="top">
                <td>24</td>
                <td>www.bauchtumor.ch</td>
                <td>Universitätsspital Bern</td>
                <td>PI</td>
              </tr>
              <tr valign="top">
                <td>25</td>
                <td>www.fettabsaugungen.ch</td>
                <td>FSnD Ltd</td>
                <td>PO</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table6fn1">
              <p><sup>a</sup>PI: public institution.</p>
            </fn>
            <fn id="table6fn2">
              <p><sup>b</sup>PO: private organization.</p>
            </fn>
            <fn id="table6fn3">
              <p><sup>c</sup>NPO: nonprofit organization.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure8" position="float">
          <label>Figure 8</label>
          <caption>
            <p>A small extract of the host-aggregated web graph with focus on the website www.rki.de. The surrounding nodes represent websites with a maximum link distance of two from www.rki.de. An edge between two nodes implies that there exists at least one hyperlink between some web pages of the hosting websites in either direction. Only those websites are included whose content is highly health-related (ie, which were automatically classified as belonging to H with a probability equal to or greater than 0.93). Moreover, they have at least one ingoing and one outgoing link. The bigger a node and its caption, the higher its PageRank. For illustration reasons, directional arrows were not included.</p>
          </caption>
          <graphic xlink:href="jmir_v22i7e17853_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>One aim of the study was to demonstrate the suitability of a focused crawler approach for the acquisition of health-related content. Our system achieved an average harvest rate of 19.76% during the entire crawl. In addition, the results show that the majority of the target seeds (4105/5000) could be obtained, which corresponds to a recall of 0.821. Therefore, we are confident that the proposed method is suitable to acquire most health-related content on the web and generate a suitable domain-specific graph representation.</p>
        <p>A first manual investigation of several hundred randomly selected pages suggests that our approach produces accurate results. The results indicate that the web sites and web pages of major German, Austrian, and Swiss health-related public institutions have indeed been discovered, even though they were not contained in the initial seeds.</p>
        <p>With respect to the study aims 2 and 3, we were able to provide a curated list of 22,405 seed points for the health domain in the D-A-CH region extracted from Curlie (see <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). In addition, a data set with 396 items was created and evaluated by crowd-workers that can be used by other researchers to evaluate similar text classifiers (see <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>).</p>
        <p>A first analysis of the graph structure (see study aim 4) shows that public institutions and nonprofit organizations have a higher importance according to their PageRank than web sites of private players inside the GHW.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>Several limitations apply to this study. First, we are not sure whether the seed pages cover a broad spectrum of topics within the health domain as we only acquired seeds from Curlie and a previous health-related crawl [<xref ref-type="bibr" rid="ref72">72</xref>]. Using specifically crafted queries against established search engines would have increased the number of available seeds and could have influenced the crawl in a positive way [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref66">66</xref>,<xref ref-type="bibr" rid="ref69">69</xref>]. However, due to limited resources and time, we did not follow this approach as the web taxonomy Curlie and a previous crawl gave faster access to seed URLs. As Curlie is a community-driven web taxonomy, the publication process of new URLs is not strictly regulated. This might be a reason for the high share of private players within the top ranks of the web graph as everybody is eligible to publish a web site’s URL on Curlie. In addition, the community behind Curlie is rather small compared with its predecessors (ie, URLs pointing to rather new content providers might not be contained in it). Therefore, corresponding web pages and their out-links might have been missed during the crawling process. This implies that reported graph properties might have been influenced by the chosen seed sources.</p>
        <p>Second, with a mean accuracy of 0.951, our classifier might have produced false positive results during the crawl process. Third, we only considered the ccTLDs .de, .at, and .ch to avoid the need for a language classification system, as most web sites on these ccTLDs are written in German. Therefore, the data crawled covers only a certain fraction of the GHW; for example, web sites in German published under .org are not contained.</p>
      </sec>
      <sec>
        <title>Comparison With Prior Work</title>
        <p>Previous studies investigated the use of focused crawler techniques to harvest biomedical or health-related text material [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>]. In both analyses, the authors report that the use of focused crawlers requires a lot of computational effort to collect the data and analyze it in an appropriate way, which we can confirm by our observations.</p>
        <p>Compared with the study by Rheinländer et al [<xref ref-type="bibr" rid="ref28">28</xref>] in which they report a harvest rate of 38%, our system achieved a harvest rate of only 19.76%. This might be because (1) our system used a soft-focused crawling strategy, meaning that it did not stop at the first encountered irrelevant web page, which led to an increase in irrelevant web pages and crawling time, and (2) our crawl was limited to the ccTLDs .de, .at, and .ch as we did not implement a language classifier. This might have influenced the harvest rate of our system as well, yet it achieved a harvest rate in the typical range for such systems [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref51">51</xref>] (see Related Work).</p>
        <p>In contrast to the studies by Rheinländer et al [<xref ref-type="bibr" rid="ref28">28</xref>] and Amalia et al [<xref ref-type="bibr" rid="ref27">27</xref>], we focused on the German language and the GHW. This study contributes to the field by demonstrating the suitability of a focused crawler approach for the acquisition of German health-related content in the D-A-CH region. A secondary study outcome is a curated list of seed points for the health domain in the D-A-CH region (see <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). In addition, the crowd-validated evaluation data set (see <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>) can be used to evaluate other text classifiers for the given purpose. Moreover, this study gives first insights regarding the graph structure of the health-related web in the D-A-CH region.</p>
      </sec>
      <sec>
        <title>Conclusions and Further Research</title>
        <p>In this study, a system was presented that uses a focused crawling approach to gather the structure of the GHW. The system used an SVM-based classifier that was trained to assess the relevance of a web page for the health domain. The results indicate that the presented focused crawler is a suitable method for acquiring large health-related textual data sets and can be used to generate domain-specific graph representations. In future work, we intend to expand our web crawl by leveraging seed lists generated via search engine providers.</p>
        <p>We also plan to analyze the linguistic characteristics of the crawled data as well as identify important topics and trends within this data. This will also include the identification of credible content providers and a comparison of the health-related web between Germany, Austria, and Switzerland. Moreover, future work will include a deeper exploration and analysis as well as a visualization of the resulting graph structure. Using these insights and with the acquired data available, an implementation and evaluation of a health-specific search engine for information-seeking citizens will be possible.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Instructions for raters (written in German).</p>
        <media xlink:href="jmir_v22i7e17853_app1.docx" xlink:title="DOCX File, 17 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>List of seed points.</p>
        <media xlink:href="jmir_v22i7e17853_app2.xlsx" xlink:title="XLSX File (Microsoft Excel File), 484 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Crowd-annotated corpus of health-related and non–health-related web pages.</p>
        <media xlink:href="jmir_v22i7e17853_app3.xlsx" xlink:title="XLSX File (Microsoft Excel File), 26 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">ccTLD</term>
          <def>
            <p>country-code top-level domain</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">D-A-CH</term>
          <def>
            <p>Germany, Austria, and Switzerland</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">GHW</term>
          <def>
            <p>German Health Web</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">SVM</term>
          <def>
            <p>support vector machine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">TLD</term>
          <def>
            <p>top-level domain</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors would like to thank Dr Monika Pobiruchin and Martin Wiesner of Heilbronn University for their feedback and valuable input to the work.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cline</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Haynes</surname>
              <given-names>KM</given-names>
            </name>
          </person-group>
          <article-title>Consumer health information seeking on the internet: the state of the art</article-title>
          <source>Health Educ Res</source>
          <year>2001</year>
          <month>12</month>
          <volume>16</volume>
          <issue>6</issue>
          <fpage>671</fpage>
          <lpage>692</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://her.oxfordjournals.org/cgi/pmidlookup?view=long&#38;pmid=11780707"/>
          </comment>
          <pub-id pub-id-type="medline">11780707</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eysenbach</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Köhler</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>How do consumers search for and appraise health information on the world wide web? Qualitative study using focus groups, usability tests, and in-depth interviews</article-title>
          <source>BMJ</source>
          <year>2002</year>
          <month>03</month>
          <day>9</day>
          <volume>324</volume>
          <issue>7337</issue>
          <fpage>573</fpage>
          <lpage>577</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/11884321"/>
          </comment>
          <pub-id pub-id-type="medline">11884321</pub-id>
          <pub-id pub-id-type="pmcid">PMC78994</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Andreassen</surname>
              <given-names>HK</given-names>
            </name>
            <name name-style="western">
              <surname>Bujnowska-Fedak</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Chronaki</surname>
              <given-names>CE</given-names>
            </name>
            <name name-style="western">
              <surname>Dumitru</surname>
              <given-names>RC</given-names>
            </name>
            <name name-style="western">
              <surname>Pudule</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Santana</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Voss</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wynn</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>European citizens' use of E-health services: a study of seven countries</article-title>
          <source>BMC Public Health</source>
          <year>2007</year>
          <volume>7</volume>
          <fpage>53</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.biomedcentral.com/1471-2458/7/53"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1471-2458-7-53</pub-id>
          <pub-id pub-id-type="medline">17425798</pub-id>
          <pub-id pub-id-type="pii">1471-2458-7-53</pub-id>
          <pub-id pub-id-type="pmcid">PMC1855923</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>HT</given-names>
            </name>
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>GR</given-names>
            </name>
          </person-group>
          <article-title>Striking jump in consumers seeking health care information</article-title>
          <source>Track Rep</source>
          <year>2008</year>
          <month>08</month>
          <issue>20</issue>
          <fpage>1</fpage>
          <lpage>8</lpage>
          <pub-id pub-id-type="medline">18770913</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fox</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Duggan</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <source>Health Online 2013</source>
          <year>2013</year>
          <month>01</month>
          <day>15</day>
          <access-date>2020-06-08</access-date>
          <publisher-loc>Washington</publisher-loc>
          <publisher-name>Pew Internet and American Life Project</publisher-name>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.pewresearch.org/internet/wp-content/uploads/sites/9/media/Files/Reports/PIP_HealthOnline.pdf">https://www.pewresearch.org/internet/wp-content/uploads/sites/9/media/Files/Reports/PIP_HealthOnline.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Prestin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Vieux</surname>
              <given-names>SN</given-names>
            </name>
            <name name-style="western">
              <surname>Chou</surname>
              <given-names>WS</given-names>
            </name>
          </person-group>
          <article-title>Is online health activity alive and well or flatlining? Findings from 10 years of the Health Information National Trends Survey</article-title>
          <source>J Health Commun</source>
          <year>2015</year>
          <month>07</month>
          <volume>20</volume>
          <issue>7</issue>
          <fpage>790</fpage>
          <lpage>798</lpage>
          <pub-id pub-id-type="doi">10.1080/10810730.2015.1018590</pub-id>
          <pub-id pub-id-type="medline">26042588</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jacobs</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Amuta</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Jeon</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Health information seeking in the digital age: an analysis of health information seeking behavior among US adults</article-title>
          <source>Cogent Soc Sci</source>
          <year>2017</year>
          <month>3</month>
          <day>13</day>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>442</fpage>
          <lpage>448</lpage>
          <pub-id pub-id-type="doi">10.1080/23311886.2017.1302785</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sbaffi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Rowley</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Trust and credibility in web-based health information: a review and agenda for future research</article-title>
          <source>J Med Internet Res</source>
          <year>2017</year>
          <month>06</month>
          <day>19</day>
          <volume>19</volume>
          <issue>6</issue>
          <fpage>e218</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jmir.org/2017/6/e218/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/jmir.7579</pub-id>
          <pub-id pub-id-type="medline">28630033</pub-id>
          <pub-id pub-id-type="pii">v19i6e218</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>DK</given-names>
            </name>
            <name name-style="western">
              <surname>Cheung</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Online health information seeking and ehealth literacy among patients attending a primary care clinic in Hong Kong: a cross-sectional survey</article-title>
          <source>J Med Internet Res</source>
          <year>2019</year>
          <month>03</month>
          <day>27</day>
          <volume>21</volume>
          <issue>3</issue>
          <fpage>e10831</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jmir.org/2019/3/e10831/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/10831</pub-id>
          <pub-id pub-id-type="medline">30916666</pub-id>
          <pub-id pub-id-type="pii">v21i3e10831</pub-id>
          <pub-id pub-id-type="pmcid">PMC6456826</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wetter</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <source>Consumer Health Informatics: New Services, Roles, and Responsibilities</source>
          <year>2016</year>
          <publisher-loc>Cham</publisher-loc>
          <publisher-name>Springer International Publishing</publisher-name>
          <fpage>978</fpage>
          <lpage>983</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>Working Group GPGI</collab>
          </person-group>
          <article-title>Good practice guidelines for health information</article-title>
          <source>Zeitschrift für Evidenz, Fortbildung und Qualität im Gesundheitswesen</source>
          <year>2016</year>
          <volume>110-111</volume>
          <fpage>e1</fpage>
          <lpage>e8</lpage>
          <pub-id pub-id-type="doi">10.1016/j.zefq.2016.01.004</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Feldwisch-Drentrup</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Kuhrt</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <source>Schlechte und gefährliche Gesundheitsinformationen: Wie sie erkannt und Patienten besser geschützt werden können [Bad and harmful health information: How to identify such information to better protect patients]</source>
          <year>2019</year>
          <access-date>2020-06-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.bertelsmann-stiftung.de/fileadmin/files/BSt/Publikationen/GrauePublikationen/VV_Analyse_Gefaehrliche_Gesundheitsinfos_final.pdf">https://www.bertelsmann-stiftung.de/fileadmin/files/BSt/Publikationen/GrauePublikationen/VV_Analyse_Gefaehrliche_Gesundheitsinfos_final.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Silberg</surname>
              <given-names>WM</given-names>
            </name>
            <name name-style="western">
              <surname>Lundberg</surname>
              <given-names>GD</given-names>
            </name>
            <name name-style="western">
              <surname>Musacchio</surname>
              <given-names>RA</given-names>
            </name>
          </person-group>
          <article-title>Assessing, controlling, and assuring the quality of medical information on the Internet: caveant lector et viewor—let the reader and viewer beware</article-title>
          <source>JAMA</source>
          <year>1997</year>
          <month>04</month>
          <day>16</day>
          <volume>277</volume>
          <issue>15</issue>
          <fpage>1244</fpage>
          <lpage>1245</lpage>
          <pub-id pub-id-type="medline">9103351</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>van den Bosch</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bogers</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>de Kunder</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Estimating search engine index size variability: a 9-year longitudinal study</article-title>
          <source>Scientometrics</source>
          <year>2016</year>
          <volume>107</volume>
          <fpage>839</fpage>
          <lpage>856</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/27122648"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s11192-016-1863-z</pub-id>
          <pub-id pub-id-type="medline">27122648</pub-id>
          <pub-id pub-id-type="pii">1863</pub-id>
          <pub-id pub-id-type="pmcid">PMC4833824</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Broder</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kumar</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Maghoul</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Raghavan</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Rajagopalan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Stata</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Tomkins</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wiener</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Graph structure in the Web</article-title>
          <source>Computer Networks</source>
          <year>2000</year>
          <month>06</month>
          <volume>33</volume>
          <issue>1-6</issue>
          <fpage>309</fpage>
          <lpage>320</lpage>
          <pub-id pub-id-type="doi">10.1016/s1389-1286(00)00083-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Meusel</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Mika</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Blanco</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Focused crawling for structured data</article-title>
          <source>Proc 23rd ACM Int Conf Inf Knowl Manag</source>
          <year>2014</year>
          <fpage>1039</fpage>
          <lpage>1048</lpage>
          <pub-id pub-id-type="doi">10.1145/2661829.2661902</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Meusel</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>The Graph Structure in the Web – Analyzed on Different Aggregation Levels</article-title>
          <source>J Web Sci</source>
          <year>2015</year>
          <month>06</month>
          <day>24</day>
          <volume>1</volume>
          <issue>1</issue>
          <fpage>33</fpage>
          <lpage>47</lpage>
          <pub-id pub-id-type="doi">10.1561/106.00000003</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xue</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>China Web Graph Measurements and Evolution</article-title>
          <source>Web Technologies Research and Development</source>
          <year>2005</year>
          <publisher-loc>Berlin</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>668</fpage>
          <lpage>679</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Baggio</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>The web graph of a tourism system</article-title>
          <source>Physica A Stat Mech Appl</source>
          <year>2007</year>
          <month>6</month>
          <volume>379</volume>
          <issue>2</issue>
          <fpage>727</fpage>
          <lpage>734</lpage>
          <pub-id pub-id-type="doi">10.1016/j.physa.2007.01.008</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kumar</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bhatia</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Rattan</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>A survey of Web crawlers for information retrieval</article-title>
          <source>WIREs Data Mining Knowl Discov</source>
          <year>2017</year>
          <month>08</month>
          <day>07</day>
          <volume>7</volume>
          <issue>6</issue>
          <fpage>e1218</fpage>
          <pub-id pub-id-type="doi">10.1002/widm.1218</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>LaValley</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Kiviniemi</surname>
              <given-names>MT</given-names>
            </name>
            <name name-style="western">
              <surname>Gage-Bouchard</surname>
              <given-names>EA</given-names>
            </name>
          </person-group>
          <article-title>Where people look for online health information</article-title>
          <source>Health Info Libr J</source>
          <year>2016</year>
          <month>05</month>
          <day>21</day>
          <fpage>146</fpage>
          <lpage>155</lpage>
          <pub-id pub-id-type="doi">10.1111/hir.12143</pub-id>
          <pub-id pub-id-type="medline">27207817</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Hawking</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Craswell</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Griffiths</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Focused crawling for both topical relevance and quality of medical information</article-title>
          <source>Proc 14th ACM Int Conf Inf Knowl Manag</source>
          <year>2005</year>
          <fpage>147</fpage>
          <lpage>154</lpage>
          <pub-id pub-id-type="doi">10.1145/1099554.1099583</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pirkola</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <source>Focused crawling: a means to acquire biological data from the web</source>
          <year>2007</year>
          <access-date>2020-06-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.87.6495&#38;rep=rep1&#38;type=pdf">http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.87.6495&#38;rep=rep1&#38;type=pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abbasi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Zeng</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Adjeroh</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Crawling credible online medical sentiments for social intelligence</article-title>
          <year>2013</year>
          <conf-name>International Conference on Social Computing</conf-name>
          <conf-date>2013</conf-date>
          <conf-loc>Alexandria</conf-loc>
          <fpage>254</fpage>
          <lpage>263</lpage>
          <pub-id pub-id-type="doi">10.1109/socialcom.2013.43</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abbasi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Abraham</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Parsons</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A Prototype System for Collecting and Analyzing Credible Online Medical Content</article-title>
          <source>Tackling Society’s Grand Challenges With Design Science. DESRIST 2016. Lecture Notes in Computer Science</source>
          <year>2016</year>
          <publisher-loc>Cham</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>197</fpage>
          <lpage>201</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Tourassi</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>A user-oriented web crawler for selectively acquiring online content in e-health research</article-title>
          <source>Bioinformatics</source>
          <year>2014</year>
          <month>01</month>
          <day>01</day>
          <volume>30</volume>
          <issue>1</issue>
          <fpage>104</fpage>
          <lpage>114</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/24078710"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btt571</pub-id>
          <pub-id pub-id-type="medline">24078710</pub-id>
          <pub-id pub-id-type="pii">btt571</pub-id>
          <pub-id pub-id-type="pmcid">PMC3866553</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Amalia</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gunawan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Najwan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Meirina</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Focused crawler for the acquisition of health articles</article-title>
          <year>2016</year>
          <conf-name>International Conference on Data and Software Engineering</conf-name>
          <conf-date>2016</conf-date>
          <conf-loc>Denpasar</conf-loc>
          <fpage>1</fpage>
          <lpage>6</lpage>
          <pub-id pub-id-type="doi">10.1109/icodse.2016.7936110</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rheinländer</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lehmann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kunkel</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Meier</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Leser</surname>
              <given-names>U</given-names>
            </name>
          </person-group>
          <article-title>Potential and Pitfalls of Domain-Specific Information Extraction at Web Scale</article-title>
          <source>Proc 2016 Int Conf Manag Data</source>
          <year>2016</year>
          <fpage>759</fpage>
          <lpage>771</lpage>
          <pub-id pub-id-type="doi">10.1145/2882903.2903736</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <source>Web Data Mining: Exploring Hyperlinks, Contents, and Usage Data, 2nd Edition</source>
          <year>2011</year>
          <publisher-loc>Berlin</publisher-loc>
          <publisher-name>Springer</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brin</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Page</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Reprint of: The anatomy of a large-scale hypertextual web search engine</article-title>
          <source>Comput Netw</source>
          <year>2012</year>
          <month>12</month>
          <volume>56</volume>
          <issue>18</issue>
          <fpage>3825</fpage>
          <lpage>3833</lpage>
          <pub-id pub-id-type="doi">10.1016/j.comnet.2012.10.007</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chakrabarti</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>van den Berg</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Dom</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Focused crawling: a new approach to topic-specific Web resource discovery</article-title>
          <source>Comput Netw</source>
          <year>1999</year>
          <month>5</month>
          <volume>31</volume>
          <issue>11-16</issue>
          <fpage>1623</fpage>
          <lpage>1640</lpage>
          <pub-id pub-id-type="doi">10.1016/s1389-1286(99)00052-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Davison</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Topical locality in the Web</article-title>
          <year>2000</year>
          <conf-name>Proc 23rd Ann Int ACM SIGIR Conf Res Devel Inf Retrieval</conf-name>
          <conf-date>2000</conf-date>
          <conf-loc>Athens</conf-loc>
          <fpage>272</fpage>
          <lpage>279</lpage>
          <pub-id pub-id-type="doi">10.1145/345508.345597</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shkapenyuk</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Suel</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Design and implementation of a high-performance distributed Web crawler</article-title>
          <year>2002</year>
          <conf-name>18th Int Conf Data Eng</conf-name>
          <conf-date>2002</conf-date>
          <conf-loc>San Jose</conf-loc>
          <fpage>357</fpage>
          <lpage>368</lpage>
          <pub-id pub-id-type="doi">10.1109/icde.2002.994750</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eichmann</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Ethical web agents</article-title>
          <source>Comput Netw ISDN Syst</source>
          <year>1995</year>
          <month>12</month>
          <volume>28</volume>
          <issue>1-2</issue>
          <fpage>127</fpage>
          <lpage>136</lpage>
          <pub-id pub-id-type="doi">10.1016/0169-7552(95)00107-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="web">
          <article-title>A method for web robots control</article-title>
          <source>Network Working Group</source>
          <year>1996</year>
          <access-date>2019-09-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.robotstxt.org/norobots-rfc.txt">https://www.robotstxt.org/norobots-rfc.txt</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="web">
          <article-title>Robots.txt-Specifications</article-title>
          <source>Google Developers</source>
          <access-date>2019-09-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://developers.google.com/search/reference/robots_txt?hl=en">https://developers.google.com/search/reference/robots_txt?hl=en</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abiteboul</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Querying semi-structured data</article-title>
          <source>Database Theory</source>
          <year>1997</year>
          <publisher-loc>Berlin</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>1</fpage>
          <lpage>18</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ofuonye</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Beatty</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Dick</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Prevalence and classification of web page defects</article-title>
          <source>Online Inf Rev</source>
          <year>2010</year>
          <month>02</month>
          <day>23</day>
          <volume>34</volume>
          <issue>1</issue>
          <fpage>160</fpage>
          <lpage>174</lpage>
          <pub-id pub-id-type="doi">10.1108/14684521011024182</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Eliminating noisy information in web pages for data mining</article-title>
          <source>Proc Ninth ACM SIGKDD Int Conf Knowl Discovery Data Mining</source>
          <year>2003</year>
          <fpage>296</fpage>
          <lpage>305</lpage>
          <pub-id pub-id-type="doi">10.1145/956750.956785</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kohlschütter</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Fankhauser</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Nejdl</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Boilerplate detection using shallow text features</article-title>
          <source>Proc Third ACM Int Conf Web Search Data Mining</source>
          <year>2010</year>
          <fpage>441</fpage>
          <lpage>450</lpage>
          <pub-id pub-id-type="doi">10.1145/1718487.1718542</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nioche</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>StormCrawler: a collection of resources for building low-latency, scalable web crawlers on Apache Storm</article-title>
          <source>DigitalPebble Ltd</source>
          <year>2019</year>
          <access-date>2019-09-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://stormcrawler.net/">http://stormcrawler.net/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Allen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jankowski</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pathirana</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <source>Storm Applied: Strategies for Real-Time Event Processing</source>
          <year>2015</year>
          <publisher-loc>Shelter Island</publisher-loc>
          <publisher-name>Manning Publications Co</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gormley</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Tong</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <source>Elasticsearch: The Definitive Guide: A Distributed Real-Time Search and Analytics Engine</source>
          <year>2015</year>
          <publisher-loc>Sebastopol</publisher-loc>
          <publisher-name>O'Reilly Media</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Joachims</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Text categorization with support vector machines: learning with many relevant features</article-title>
          <source>Machine Learning: ECML 1998. Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence), vol 1398</source>
          <year>1998</year>
          <fpage>137</fpage>
          <lpage>142</lpage>
          <pub-id pub-id-type="doi">10.1007/bfb0026683</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zowalla</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wiesner</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pfeifer</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Automatically Assessing the Expert Degree of Online Health Content Using SVMs</article-title>
          <source>Stud Health Technol Inform</source>
          <year>2014</year>
          <volume>202</volume>
          <fpage>48</fpage>
          <lpage>51</lpage>
          <pub-id pub-id-type="medline">25000012</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Garcia-Molina</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Page</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Efficient crawling through URL ordering</article-title>
          <source>Comput Netw ISDN Syst</source>
          <year>1998</year>
          <month>4</month>
          <volume>30</volume>
          <issue>1-7</issue>
          <fpage>161</fpage>
          <lpage>172</lpage>
          <pub-id pub-id-type="doi">10.1016/s0169-7552(98)00108-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pinkerton</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>WebCrawler: Finding What People Want [Dissertation]</article-title>
          <source>University of Washington</source>
          <year>2000</year>
          <access-date>2020-06-03</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.thinkpink.com/bp/Thesis/Thesis.pdf">http://www.thinkpink.com/bp/Thesis/Thesis.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rajalakshmi</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Aravindan</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Web page classification using n-gram based URL features</article-title>
          <year>2013</year>
          <conf-name>Fifth Int Conf Adv Comput</conf-name>
          <conf-date>2013</conf-date>
          <conf-loc>Chennai</conf-loc>
          <publisher-loc>2013 Fifth International Conference on Advanced Computing (ICoAC) Chennai, India</publisher-loc>
          <publisher-name>IEEE</publisher-name>
          <fpage>15</fpage>
          <lpage>21</lpage>
          <pub-id pub-id-type="doi">10.1109/icoac.2013.6921920</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Samarawickrama</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jayaratne</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Automatic text classification and focused crawling</article-title>
          <year>2011</year>
          <conf-name>Sixth Int Conf Dig Inf Manag</conf-name>
          <conf-date>2011</conf-date>
          <conf-loc>Melbourne</conf-loc>
          <publisher-loc>2011 Sixth International Conference on Digital Information Management; Melbourne, Australia</publisher-loc>
          <publisher-name>IEEE</publisher-name>
          <fpage>143</fpage>
          <lpage>148</lpage>
          <pub-id pub-id-type="doi">10.1109/icdim.2011.6093329</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bedi</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Thukral</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Banati</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Behl</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mendiratta</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>A multi-threaded semantic focused crawler</article-title>
          <source>J Comput Sci Technol</source>
          <year>2012</year>
          <month>11</month>
          <day>15</day>
          <volume>27</volume>
          <issue>6</issue>
          <fpage>1233</fpage>
          <lpage>1242</lpage>
          <pub-id pub-id-type="doi">10.1007/s11390-012-1299-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pant</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Srinivasan</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Learning to crawl: comparing classification schemes</article-title>
          <source>ACM Trans Inf Syst</source>
          <year>2005</year>
          <month>10</month>
          <day>01</day>
          <volume>23</volume>
          <issue>4</issue>
          <fpage>430</fpage>
          <lpage>462</lpage>
          <pub-id pub-id-type="doi">10.1145/1095872.1095875</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Menczer</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>ARACHNID: adaptive retrieval agents choosing heuristic neighborhoods for information discovery</article-title>
          <source>Proc 14th Int Conf Mach Learn</source>
          <year>1997</year>
          <fpage>227</fpage>
          <lpage>235</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Srinivasan</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Menczer</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Pant</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>A general evaluation framework for topical crawlers</article-title>
          <source>Inf Retrieval</source>
          <year>2005</year>
          <month>1</month>
          <volume>8</volume>
          <issue>3</issue>
          <fpage>417</fpage>
          <lpage>447</lpage>
          <pub-id pub-id-type="doi">10.1007/s10791-005-6993-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Pedersen</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A comparative study on feature selection in text categorization</article-title>
          <source>Int Conf Mach Learn</source>
          <year>1997</year>
          <fpage>412</fpage>
          <lpage>420</lpage>
          <pub-id pub-id-type="doi">10.5555/645526.657137</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>CC</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>CJ</given-names>
            </name>
          </person-group>
          <source>LIBSVM: a library for support vector machines</source>
          <access-date>2020-06-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.csie.ntu.edu.tw/~cjlin/papers/libsvm.pdf">https://www.csie.ntu.edu.tw/~cjlin/papers/libsvm.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zowalla</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wiesner</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <source>zlibsvm: an object-oriented Java-binding for support vector machines in the medical domain</source>
          <year>2017</year>
          <publisher-loc>Düsseldorf</publisher-loc>
          <publisher-name>German Medical Science GMS Publishing House</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Salton</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Buckley</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Term-weighting approaches in automatic text retrieval</article-title>
          <source>Information Processing &amp; Management</source>
          <year>1988</year>
          <month>1</month>
          <volume>24</volume>
          <issue>5</issue>
          <fpage>513</fpage>
          <lpage>523</lpage>
          <pub-id pub-id-type="doi">10.1016/0306-4573(88)90021-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref58">
        <label>58</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dobbin</surname>
              <given-names>KK</given-names>
            </name>
            <name name-style="western">
              <surname>Simon</surname>
              <given-names>RM</given-names>
            </name>
          </person-group>
          <article-title>Optimally splitting cases for training and testing high dimensional classifiers</article-title>
          <source>BMC Med Genomics</source>
          <year>2011</year>
          <month>04</month>
          <day>08</day>
          <volume>4</volume>
          <fpage>31</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedgenomics.biomedcentral.com/articles/10.1186/1755-8794-4-31"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1755-8794-4-31</pub-id>
          <pub-id pub-id-type="medline">21477282</pub-id>
          <pub-id pub-id-type="pii">1755-8794-4-31</pub-id>
          <pub-id pub-id-type="pmcid">PMC3090739</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref59">
        <label>59</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Dunbrack</surname>
              <given-names>RL</given-names>
            </name>
          </person-group>
          <article-title>The role of balanced training and testing data sets for binary classifiers in bioinformatics</article-title>
          <source>PLoS One</source>
          <year>2013</year>
          <volume>8</volume>
          <issue>7</issue>
          <fpage>e67863</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0067863"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0067863</pub-id>
          <pub-id pub-id-type="medline">23874456</pub-id>
          <pub-id pub-id-type="pii">PONE-D-12-35411</pub-id>
          <pub-id pub-id-type="pmcid">PMC3706434</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref60">
        <label>60</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Meusel</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Vigna</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lehmberg</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Bizer</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Graph structure in the Web—revisited: a trick of the heavy tail</article-title>
          <source>WWW ’14 Companion: Proc 23rd Int Conf World Wide Web</source>
          <year>2014</year>
          <fpage>427</fpage>
          <lpage>432</lpage>
          <pub-id pub-id-type="doi">10.1145/2567948.2576928</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref61">
        <label>61</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lehmberg</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Meusel</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bizer</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Graph structure in the Web: aggregated by pay-level domain</article-title>
          <source>Proc 2014 ACM Conf Web Sci</source>
          <year>2014</year>
          <conf-name>ACM conference on Web science</conf-name>
          <conf-date>2014</conf-date>
          <conf-loc>Bloomington</conf-loc>
          <fpage>119</fpage>
          <lpage>128</lpage>
          <pub-id pub-id-type="doi">10.1145/2615569.2615674</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref62">
        <label>62</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gross</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yellen</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <source>Handbook of Graph Theory</source>
          <year>2004</year>
          <publisher-loc>Boca Raton</publisher-loc>
          <publisher-name>CRC Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref63">
        <label>63</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Blondel</surname>
              <given-names>VD</given-names>
            </name>
            <name name-style="western">
              <surname>Guillaume</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lambiotte</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Lefebvre</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Fast unfolding of communities in large networks</article-title>
          <source>J Stat Mech</source>
          <year>2008</year>
          <month>10</month>
          <day>09</day>
          <volume>2008</volume>
          <issue>10</issue>
          <fpage>P10008</fpage>
          <pub-id pub-id-type="doi">10.1088/1742-5468/2008/10/P10008</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref64">
        <label>64</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Page</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Brin</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Motwani</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Winograd</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <source>The PageRank citation ranking: bringing order to the web</source>
          <year>1999</year>
          <access-date>2020-06-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://ilpubs.stanford.edu:8090/422/1/1999-66.pdf">http://ilpubs.stanford.edu:8090/422/1/1999-66.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref65">
        <label>65</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Albert</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Barabási</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Statistical mechanics of complex networks</article-title>
          <source>Rev Mod Phys</source>
          <year>2002</year>
          <month>1</month>
          <day>30</day>
          <volume>74</volume>
          <issue>1</issue>
          <fpage>47</fpage>
          <lpage>97</lpage>
          <pub-id pub-id-type="doi">10.1103/revmodphys.74.47</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref66">
        <label>66</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vieira</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Barbosa</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>da Silva</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Freire</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Moura</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Finding seeds to bootstrap focused crawlers</article-title>
          <source>World Wide Web</source>
          <year>2015</year>
          <month>2</month>
          <day>26</day>
          <volume>19</volume>
          <issue>3</issue>
          <fpage>449</fpage>
          <lpage>474</lpage>
          <pub-id pub-id-type="doi">10.1007/s11280-015-0331-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref67">
        <label>67</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Teregowda</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Ramírez</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mitra</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Giles</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>The evolution of a crawling strategy for an academic document search engine: whitelists and blacklists</article-title>
          <source>Proc 4th Ann ACM Web Sci Conf</source>
          <year>2012</year>
          <fpage>340</fpage>
          <lpage>343</lpage>
          <pub-id pub-id-type="doi">10.1145/2380718.2380762</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref68">
        <label>68</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dmitriev</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Giles</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Graph based crawler seed selection</article-title>
          <source>Proc 18th Int Conf World Wide Web</source>
          <year>2009</year>
          <fpage>1089</fpage>
          <lpage>1090</lpage>
          <pub-id pub-id-type="doi">10.1145/1526709.1526870</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref69">
        <label>69</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Prasath</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Öztürk</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Finding potential seeds through rank aggregation of web searches</article-title>
          <source>Proc 4th Int Conf Pattern Recog Mach Intell</source>
          <year>2011</year>
          <fpage>227</fpage>
          <lpage>234</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-642-21786-9_38</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref70">
        <label>70</label>
        <nlm-citation citation-type="web">
          <article-title>Curlie</article-title>
          <source>Curlie Project Inc</source>
          <year>2019</year>
          <access-date>2019-09-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://curlie.org/">https://curlie.org/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref71">
        <label>71</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chakrabarti</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Punera</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Subramanyam</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Accelerated focused crawling through online relevance feedback</article-title>
          <source>Proc 11th Int Conf World Wide Web</source>
          <year>2002</year>
          <fpage>148</fpage>
          <lpage>159</lpage>
          <pub-id pub-id-type="doi">10.1145/511446.511466</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref72">
        <label>72</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zowalla</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wiesner</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pfeifer</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Analyzing the German Health Web using a focused crawling approach</article-title>
          <year>2016</year>
          <conf-name>Health—Exploring Complexity: An Interdisciplinary Systems Approach</conf-name>
          <conf-date>2016</conf-date>
          <conf-loc>Munich</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref73">
        <label>73</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ganjisaffar</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <source>crawler4j: an open source crawler for Java</source>
          <year>2016</year>
          <access-date>2019-09-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/yasserg/crawler4j">https://github.com/yasserg/crawler4j</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref74">
        <label>74</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Maier-Hein</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Mersmann</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kondermann</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bodenstedt</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sanchez</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Stock</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Kenngott</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Eisenmann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Speidel</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Can masses of non-experts train highly accurate image classifiers?</article-title>
          <source>Med Image Comput Comput Assist Interv</source>
          <year>2014</year>
          <volume>17</volume>
          <issue>Pt 2</issue>
          <fpage>438</fpage>
          <lpage>445</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-319-10470-6_55</pub-id>
          <pub-id pub-id-type="medline">25485409</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref75">
        <label>75</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Maier-Hein</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Kondermann</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Roß</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Mersmann</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Heim</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Bodenstedt</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kenngott</surname>
              <given-names>HG</given-names>
            </name>
            <name name-style="western">
              <surname>Sanchez</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wagner</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Preukschas</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wekerle</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Helfert</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>März</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Mehrabi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Speidel</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Stock</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Crowdtruth validation: a new paradigm for validating algorithms that rely on image correspondences</article-title>
          <source>Int J Comput Assist Radiol Surg</source>
          <year>2015</year>
          <month>08</month>
          <volume>10</volume>
          <issue>8</issue>
          <fpage>1201</fpage>
          <lpage>1212</lpage>
          <pub-id pub-id-type="doi">10.1007/s11548-015-1168-3</pub-id>
          <pub-id pub-id-type="medline">25895078</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref76">
        <label>76</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Irshad</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Oh</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Schmolze</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Quintana</surname>
              <given-names>LM</given-names>
            </name>
            <name name-style="western">
              <surname>Collins</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Tamimi</surname>
              <given-names>RM</given-names>
            </name>
            <name name-style="western">
              <surname>Beck</surname>
              <given-names>AH</given-names>
            </name>
          </person-group>
          <article-title>Crowdsourcing scoring of immunohistochemistry images: evaluating performance of the crowd and an automated computational method</article-title>
          <source>Sci Rep</source>
          <year>2017</year>
          <month>02</month>
          <day>23</day>
          <volume>7</volume>
          <fpage>43286</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.doi.org/10.1038/srep43286"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/srep43286</pub-id>
          <pub-id pub-id-type="medline">28230179</pub-id>
          <pub-id pub-id-type="pii">srep43286</pub-id>
          <pub-id pub-id-type="pmcid">PMC5322394</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref77">
        <label>77</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>O'Neil</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Murchison</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>van Beek</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Goatman</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Crowdsourcing labels for pathological patterns in CT lung scans: can non-experts contribute expert-quality ground truth?</article-title>
          <source>Intravascular Imaging and Computer Assisted Stenting, and Large-Scale Annotation of Biomedical Data and Expert Label Synthesis</source>
          <year>2017</year>
          <publisher-loc>Berlin</publisher-loc>
          <publisher-name>Springer International Publishing</publisher-name>
          <fpage>96</fpage>
          <lpage>105</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref78">
        <label>78</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Park</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mirhosseini</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Nadeem</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Marino</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kaufman</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Baker</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Barish</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Crowdsourcing for identification of polyp-free segments in virtual colonoscopy videos</article-title>
          <source>Medical Imaging 2017: Imaging Informatics for Healthcare, Research, and Applications International Society for Optics and Photonics</source>
          <year>2017</year>
          <fpage>1</fpage>
          <pub-id pub-id-type="doi">10.1117/12.2252281</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref79">
        <label>79</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jha</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Andreas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Thadani</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Rosenthal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>McKeown</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Corpus creation for new genres: a crowdsourced approach to PP attachment</article-title>
          <source>Workshop on Creating Speech and Language Data with Amazon’s Mechanical Turk</source>
          <year>2010</year>
          <fpage>13</fpage>
          <lpage>20</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/W10-0702.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref80">
        <label>80</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sabou</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bontcheva</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Derczynski</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Scharl</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Corpus annotation through crowdsourcing: towards best practice guidelines</article-title>
          <year>2014</year>
          <conf-name>Proc 9th Int Conf Lang Resources Eval</conf-name>
          <conf-date>2014</conf-date>
          <conf-loc>Reykjavik</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.lrec-conf.org/proceedings/lrec2014/pdf/497_Paper.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref81">
        <label>81</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Simperl</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>How to use crowdsourcing effectively: guidelines and examples</article-title>
          <source>LIBER Q</source>
          <year>2015</year>
          <month>08</month>
          <day>18</day>
          <volume>25</volume>
          <issue>1</issue>
          <fpage>18</fpage>
          <lpage>39</lpage>
          <pub-id pub-id-type="doi">10.18352/lq.9948</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref82">
        <label>82</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gadiraju</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Fetahu</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Kawase</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Training workers for improving performance in crowdsourcing microtasks</article-title>
          <source>Design Teaching Learn Netw World</source>
          <year>2015</year>
          <fpage>110</fpage>
          <lpage>114</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-319-24258-3_8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref83">
        <label>83</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hube</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Fetahu</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Gadiraju</surname>
              <given-names>U</given-names>
            </name>
          </person-group>
          <article-title>Understanding and mitigating worker biases in the crowdsourced collection of subjective judgments</article-title>
          <source>Conf Hum Factors Comput Syst</source>
          <year>2019</year>
          <fpage>1</fpage>
          <lpage>12</lpage>
          <pub-id pub-id-type="doi">10.1145/3290605.3300637</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref84">
        <label>84</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Carvalho</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dimitrov</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Larson</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>How many crowdsourced workers should a requester hire?</article-title>
          <source>Ann Math Artif Intell</source>
          <year>2016</year>
          <month>1</month>
          <day>6</day>
          <volume>78</volume>
          <issue>1</issue>
          <fpage>45</fpage>
          <lpage>72</lpage>
          <pub-id pub-id-type="doi">10.1007/s10472-015-9492-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref85">
        <label>85</label>
        <nlm-citation citation-type="web">
          <source>Figure Eight: the Essential High-Quality Data Annotation Platform</source>
          <access-date>2019-10-02</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.figure-eight.com/">https://www.figure-eight.com/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref86">
        <label>86</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lombard</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Snyder-Duch</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bracken</surname>
              <given-names>CC</given-names>
            </name>
          </person-group>
          <article-title>Content analysis in mass communication: assessment and reporting of intercoder reliability</article-title>
          <source>Human Comm Res</source>
          <year>2002</year>
          <month>10</month>
          <volume>28</volume>
          <issue>4</issue>
          <fpage>587</fpage>
          <lpage>604</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1468-2958.2002.tb00826.x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref87">
        <label>87</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fleiss</surname>
              <given-names>JL</given-names>
            </name>
          </person-group>
          <article-title>Measuring nominal scale agreement among many raters</article-title>
          <source>Psychol Bull</source>
          <year>1971</year>
          <volume>76</volume>
          <issue>5</issue>
          <fpage>378</fpage>
          <lpage>382</lpage>
          <pub-id pub-id-type="doi">10.1037/h0031619</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref88">
        <label>88</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Landis</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Koch</surname>
              <given-names>GG</given-names>
            </name>
          </person-group>
          <article-title>The measurement of observer agreement for categorical data</article-title>
          <source>Biometrics</source>
          <year>1977</year>
          <month>03</month>
          <volume>33</volume>
          <issue>1</issue>
          <fpage>159</fpage>
          <lpage>174</lpage>
          <pub-id pub-id-type="doi">10.2307/2529310</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref89">
        <label>89</label>
        <nlm-citation citation-type="web">
          <source>The Neo4j Graph Algorithms User Guide v3.5</source>
          <access-date>2019-09-13</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://neo4j.com/docs/graph-algorithms/current/">https://neo4j.com/docs/graph-algorithms/current/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref90">
        <label>90</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bastian</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Heymann</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jacomy</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <source>Gephi: an open source software for exploring and manipulating networks</source>
          <year>2009</year>
          <access-date>2020-06-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aaai.org/ocs/index.php/ICWSM/09/paper/viewFile/154/1009">https://www.aaai.org/ocs/index.php/ICWSM/09/paper/viewFile/154/1009</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
