<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
    <article-id pub-id-type="publisher-id">v21i1e10986</article-id>
    <article-id pub-id-type="pmid">30698536</article-id>
    <article-id pub-id-type="doi">10.2196/10986</article-id>
    <article-categories>
      <subj-group subj-group-type="heading">
        <subject>Original Paper</subject>
      </subj-group>
      <subj-group subj-group-type="article-type">
        <subject>Original Paper</subject>
      </subj-group>
    </article-categories>
    <title-group>
      <article-title>Consumer Health Search on the Web: Study of Web Page Understandability and Its Integration in Ranking Algorithms</article-title>
    </title-group>
    <contrib-group>
      <contrib contrib-type="editor">
        <name>
          <surname>Eysenbach</surname>
          <given-names>Gunther</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Bond</surname>
          <given-names>Carol</given-names>
        </name>
      </contrib>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Adeleke</surname>
          <given-names>Ibrahim</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="author" id="contrib1">
        <name name-style="western">
          <surname>Palotti</surname>
          <given-names>Joao</given-names>
        </name>
        <degrees>MSc</degrees>
        <xref rid="aff1" ref-type="aff">1</xref>
        <xref rid="aff2" ref-type="aff">2</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-7099-9716</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib2">
        <name name-style="western">
          <surname>Zuccon</surname>
          <given-names>Guido</given-names>
        </name>
        <degrees>PhD</degrees>
        <xref rid="aff3" ref-type="aff">3</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0003-0271-5563</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib3" corresp="yes">
      <name name-style="western">
        <surname>Hanbury</surname>
        <given-names>Allan</given-names>
      </name>
      <degrees>PhD</degrees>
      <xref rid="aff2" ref-type="aff">2</xref>
      <address>
        <institution>Institute for Information Systems Engineering</institution>
        <institution>Technische Universität Wien</institution>
        <addr-line>Favoritenstraße 9-11/194 04</addr-line>
        <addr-line>Vienna, 1040</addr-line>
        <country>Austria</country>
        <phone>43 158801188310</phone>
        <email>allan.hanbury@tuwien.ac.at</email>
      </address>  
      <xref rid="aff4" ref-type="aff">4</xref>
      <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-7149-5843</ext-link></contrib>
    </contrib-group>
    <aff id="aff1">
      <label>1</label>
      <institution>Qatar Computing Research Institute</institution>
      <addr-line>Doha</addr-line>
      <country>Qatar</country>
    </aff>
    <aff id="aff2">
    <label>2</label>
    <institution>Institute for Information Systems Engineering</institution>
    <institution>Technische Universität Wien</institution>  
    <addr-line>Vienna</addr-line>
    <country>Austria</country></aff>
    <aff id="aff3">
      <label>3</label>
      <institution>University of Queensland</institution>
      <addr-line>Brisbane</addr-line>
      <country>Australia</country>
    </aff>
    <aff id="aff4">
      <label>4</label>
      <institution>Complexity Science Hub Vienna</institution>
      <addr-line>Vienna</addr-line>
      <country>Austria</country>
    </aff>
    <author-notes>
      <corresp>Corresponding Author: Allan Hanbury 
      <email>allan.hanbury@tuwien.ac.at</email></corresp>
    </author-notes>
    <pub-date pub-type="collection"><month>01</month><year>2019</year></pub-date>
    <pub-date pub-type="epub">
      <day>30</day>
      <month>01</month>
      <year>2019</year>
    </pub-date>
    <volume>21</volume>
    <issue>1</issue>
    <elocation-id>e10986</elocation-id>
    <!--history from ojs - api-xml-->
    <history>
      <date date-type="received">
        <day>7</day>
        <month>5</month>
        <year>2018</year>
      </date>
      <date date-type="rev-request">
        <day>28</day>
        <month>6</month>
        <year>2018</year>
      </date>
      <date date-type="rev-recd">
        <day>23</day>
        <month>8</month>
        <year>2018</year>
      </date>
      <date date-type="accepted">
        <day>23</day>
        <month>9</month>
        <year>2018</year>
      </date>
    </history>
    <!--(c) the authors - correct author names and publication date here if necessary. Date in form ', dd.mm.yyyy' after jmir.org-->
    <copyright-statement>©Joao Palotti, Guido Zuccon, Allan Hanbury. Originally published in the Journal of Medical Internet Research (http://www.jmir.org), 30.01.2019.</copyright-statement>
    <copyright-year>2019</copyright-year>
    <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
      <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on http://www.jmir.org/, as well as this copyright and license information must be included.</p>
    </license>  
    <self-uri xlink:href="http://www.jmir.org/2019/1/e10986/" xlink:type="simple"/>
    <abstract>
      <sec sec-type="background">
        <title>Background</title>
        <p>Understandability plays a key role in ensuring that people accessing health information are capable of gaining insights that can assist them with their health concerns and choices. The access to unclear or misleading information has been shown to negatively impact the health decisions of the general public.</p>
      </sec>
      <sec sec-type="objective">
        <title>Objective</title>
        <p>The aim of this study was to investigate methods to estimate the understandability of health Web pages and use these to improve the retrieval of information for people seeking health advice on the Web.</p>
      </sec>
      <sec sec-type="methods">
        <title>Methods</title>
        <p>Our investigation considered methods to automatically estimate the understandability of health information in Web pages, and it provided a thorough evaluation of these methods using human assessments as well as an analysis of preprocessing factors affecting understandability estimations and associated pitfalls. Furthermore, lessons learned for estimating Web page understandability were applied to the construction of retrieval methods, with specific attention to retrieving information understandable by the general public.</p>
      </sec>
      <sec sec-type="results">
        <title>Results</title>
        <p>We found that machine learning techniques were more suitable to estimate health Web page understandability than traditional readability formulae, which are often used as guidelines and benchmark by health information providers on the Web (larger difference found for Pearson correlation of .602 using gradient boosting regressor compared with .438 using Simple Measure of Gobbledygook Index with the Conference and Labs of the Evaluation Forum eHealth 2015 collection).</p>
      </sec>
      <sec sec-type="conclusions">
        <title>Conclusions</title>
        <p>The findings reported in this paper are important for specialized search services tailored to support the general public in seeking health advice on the Web, as they document and empirically validate state-of-the-art techniques and settings for this domain application.</p>
      </sec>
    </abstract>
    <kwd-group>
      <kwd>readability</kwd>
      <kwd>literacy</kwd>
      <kwd>comprehension</kwd>
      <kwd>patients</kwd>
      <kwd>machine learning</kwd>
    </kwd-group></article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Search engines are concerned with retrieving relevant information to support a user’s information-seeking task. Commonly, signals about the topicality or aboutness of a piece of information with respect to a query are used to estimate relevance, with other relevance dimensions such as understandability and trustworthiness [<xref ref-type="bibr" rid="ref1">1</xref>] being relegated to a secondary position or completely neglected. Although this might be a minor problem for many information-seeking tasks, there are some specific tasks in which dimensions other than topicality have an important role in the information seeking and decision-making process. The seeking of health information and advice on the Web by the general public is one such task.</p>
        <p>A key problem when searching the Web for health information is that this can be too technical, unreliable, generally misleading, and can lead to unfounded escalations and poor decisions [<xref ref-type="bibr" rid="ref2">2</xref>-<xref ref-type="bibr" rid="ref4">4</xref>]. Where correct information exists, it can be hard to find and digest among the noise, spam, technicalities, and irrelevant information. In <italic>high-stakes search tasks</italic> such as this, access to poor information can lead to poor decisions, which ultimately can have a significant impact on our health and well-being [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. In this study, we are specifically interested in the understandability of health information retrieved by search engines and in improving search results to favor information understandable by the general public. We leave addressing reliability and trustworthiness of the retrieved information to future work; however, this can be achieved by extending the framework we investigate here.</p>
        <p>The use of general purpose Web search engines such as Google, Bing, and Baidu for seeking health advice has been largely analyzed, questioned, and criticized [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref11">11</xref>], despite the commendable efforts these services have put into providing increasingly better health information, for example, the Google Health Cards [<xref ref-type="bibr" rid="ref12">12</xref>].</p>
        <p>Ad hoc solutions to support the general public in searching and accessing health information on the Web have been implemented, typically supported by government initiatives or medical practitioner associations, for example, HealthOnNet.org (HON [<xref ref-type="bibr" rid="ref13">13</xref>]) and HealthDirect.gov.au, among others. These solutions aim to provide <italic>better</italic> health information to the general public. For example, HON’s mission statement is “to guide Internet users to reliable, understandable, accessible and trustworthy sources of medical and health information.” On the contrary, do the solutions that these services currently employ actually provide this type of information to the health-seeking general public?</p>
        <p>As an illustrative example, we analyzed the top 10 search results retrieved by HON on October 01, 2017 in answer to 300 health search queries generated by regular health consumers in health forums. These queries are part of the Conference and Labs of the Evaluation Forum (CLEF) 2016 electronic health (eHealth) collection [<xref ref-type="bibr" rid="ref14">14</xref>], which is extensively used in this paper. The understandability score of the retrieved pages was estimated with the most effective readability formula (RF) and preprocessing settings analyzed in this paper (low scores correspond to easy to understand Web pages). <xref ref-type="fig" rid="figure1">Figure 1</xref> reports the cumulative distribution of understandability scores for these search results (note, we did not assess their topical relevance here). Dale-Chall Index (DCI) measures the years of schooling required to understand a document. The average US resident reads at or below an 8th grade level [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref18">18</xref>], which is the level suggested by the American National Institutes of Health for health information on the Web [<xref ref-type="bibr" rid="ref19">19</xref>]. We also report the scores for the <italic>optimal</italic> search results (Oracle), as found in CLEF 2016 (relevant results that have the highest understandability scores), along with the scores for the baseline method (Best Match 25 [BM25]) and our best retrieval method, eXtreme Gradient Boosting (XGB). The results clearly indicate that despite solutions such as HON being explicitly aimed at supporting access to high-quality health information that can aid the user to take well-informed health decisions, they often fail to direct the users to information they can understand.</p>
        <p>In this paper, we aim to establish methods and best practice for developing search engines that retrieve <italic>relevant and understandable</italic> health advice from the Web. The overall contributions of this paper can be summarized as:</p>
        <list list-type="order">
          <list-item>
            <p>We propose and investigate methods for the estimation of the understandability of health information in Web pages: a large number of medically focused features are grouped in categories and their contribution to the understandability estimation task is carefully measured.</p>
          </list-item>
          <list-item>
            <p>We further study the influence of HTML processing methods on these estimations and their pitfalls, extending our previous work that has shown how this often-ignored aspect greatly impacts effectiveness [<xref ref-type="bibr" rid="ref20">20</xref>].</p>
          </list-item>
          <list-item>
            <p>We further investigate how understandability estimations can be integrated into retrieval methods to enhance the quality of the retrieved health information, with particular attention to its understandability by the general public. New models are explored in this paper, also extending our previous work [<xref ref-type="bibr" rid="ref21">21</xref>].</p>
          </list-item>
        </list>
        <p>This paper makes concrete contributions to practice, as it informs health search engines specifically tailored to the general public (eg, the HON or HealthDirect services referred to above) about the best methods they should adopt. These are novel and significant contributions as no previous work has systematically analyzed the influence of the components in this study—we show that these greatly influence retrieval effectiveness and, thus, delivery of relevant and understandable health advice.</p>
        </sec>
      <sec>
        <title>Related Work</title>
        <p>Understandability refers to the ease of comprehension of the information presented to a user. In other words, health information is understandable “when consumers of diverse backgrounds and varying levels of health literacy can process and explain key messages” [<xref ref-type="bibr" rid="ref22">22</xref>]. Often the terms understandability and readability are used interchangeably: we use readability to refer to formulae that estimate how easy it is to understand a text, usually based on its words and sentences. We use understandability to refer to the broader concept of ease of understanding: this is affected by text readability (as increasing readability tends to improve understanding) but might also be influenced by how legible a text is and its layout, including, for example, the use of images to explain difficult concepts.</p>
        
        
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Cumulative distribution of Dale-Chall Index (DCI) of search results. DCI measures the years of schooling required to understand a document. The dashed line is the 8th grade level which is the reading level of an average US resident. The distribution for HealthOnNet (HON) is similar to that of the baseline used in this paper (Best Match 25 [BM25]). Our best method (eXtreme Gradient Boosting [XGB]) reranks documents to provide more understandable results; its distribution is similar to that of an oracle system.</p>
          </caption>
          <graphic xlink:href="jmir_v21i1e10986_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        
        <p>There is a large body of literature that has examined the understandability of Web health content when the information seeker is a member of the general public. For example, Becker reported that the majority of health websites are not well designed for the elderly [<xref ref-type="bibr" rid="ref23">23</xref>], whereas Stossel et al found that health education material on the Web is not written at an adequate reading level [<xref ref-type="bibr" rid="ref18">18</xref>]. Zheng and Yu have reported on the readability of electronic health records compared with Wikipedia pages related to diabetes and found that readability measures often do not align with user ratings of readability [<xref ref-type="bibr" rid="ref24">24</xref>]. A common finding of these studies is that, in general, health content available on Web pages is often hard to understand by the general public; this includes content that is retrieved in top-ranked positions by current commercial search engines [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref11">11</xref>].</p>      
        <p>Previous linguistics and information retrieval research has attempted to devise computational methods for the automatic estimation of text readability and understandability, and for the inclusion of these within search methods or their evaluation. Computational approaches to understandability estimations include (1) <italic>RF</italic>, which generally exploit word surface characteristics of the text, (2) <italic>machine learning</italic> approaches, and (3) matching with specialized <italic>dictionaries or terminologies</italic>, often compiled with information about understandability difficulty.</p>
        <p>Measures such as Coleman-Liau Index (CLI) [<xref ref-type="bibr" rid="ref25">25</xref>], DCI [<xref ref-type="bibr" rid="ref26">26</xref>], and Flesch Reading Ease (FRE) [<xref ref-type="bibr" rid="ref27">27</xref>] belong to the first category. These measures generally rely on surface-level characteristics of text such as characters, syllables, and word counts [<xref ref-type="bibr" rid="ref28">28</xref>]. Although these measures have been widely used in studies investigating the understandability of health content retrieved by search engines [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref23">23</xref>], our preliminary work found that these measures are heavily affected by the methods used to extract text from the HTML source [<xref ref-type="bibr" rid="ref20">20</xref>]. We were able to identify specific settings of an HTML preprocessing pipeline that provided consistent estimates, but because of the lack of human assessments, we were not able to investigate how well each HTML preprocessing pipeline correlated with human assessments. In this paper, we revisited and extended this work in more detail, as we further investigated this problem by comparing the effect of HTML preprocessing on text understandability estimations in light of explicit human assessments.</p>
        <p>The use of machine learning to estimate understandability forms an alternative approach. Earlier research explored the use of statistical natural language processing and language modeling [<xref ref-type="bibr" rid="ref29">29</xref>-<xref ref-type="bibr" rid="ref31">31</xref>] as well as linguistic factors such as syntactic features or lexical cohesion [<xref ref-type="bibr" rid="ref32">32</xref>]. Although we replicated here many of the features devised in these works, they focus on estimating readability of general English documents rather than medical ones. In the medical domain, Zeng et al explored features such as word frequency in different medical corpora to estimate concept familiarity, which prompted the construction of the consumer health vocabulary (CHV) [<xref ref-type="bibr" rid="ref33">33</xref>-<xref ref-type="bibr" rid="ref35">35</xref>].</p>
        <p>The actual use of CHV or other terminologies such as the Medical Subject Headings (MeSH) belongs to the third category of approaches. The CHV is a prominent medical vocabulary dedicated to mapping layperson vocabulary to technical terms [<xref ref-type="bibr" rid="ref34">34</xref>]. It attributes a score for each of its concepts with respect to their difficulty, with lower or higher scores for harder or easier concepts. Researchers have evaluated CHV in tasks such as document analysis [<xref ref-type="bibr" rid="ref36">36</xref>] and medical expertise prediction [<xref ref-type="bibr" rid="ref37">37</xref>]. The hierarchy of MeSH was previously used in the literature to identify difficult concepts, assuming that a concept deep in the hierarchy is more difficult than a shallow one [<xref ref-type="bibr" rid="ref38">38</xref>]. Other approaches combined vocabularies with word surface characteristics and syntactic features, such as part of speech (POS), into a unique readability measure [<xref ref-type="bibr" rid="ref39">39</xref>].</p>
        <p>In this study, we investigated approaches to estimate understandability from each of these categories, including measuring the influence of HTML preprocessing on automatic understandability methods and establishing best practices.</p>
        <p>Some previous works have attempted to use understandability estimations for improving search results in consumer health search as well as methods to evaluate retrieval systems that do account for understandability along with topical relevance. Palotti et al have used learning to rank with standard retrieval features along with features based on RF and medical lexical aspects to determine understandability [<xref ref-type="bibr" rid="ref21">21</xref>]. Van Doorn et al have shown that learning a set of rankers that provide trade-offs across a number of relevance criteria, including readability or understandability, increases overall system effectiveness [<xref ref-type="bibr" rid="ref40">40</xref>]. Zuccon and Koopman [<xref ref-type="bibr" rid="ref41">41</xref>], and later Zuccon [<xref ref-type="bibr" rid="ref42">42</xref>], have proposed and investigated a family of measures based on the gain-discount framework, where the gain of a document is influenced by both its topical relevance and its understandability. They showed that although generally correlated, topical relevance evaluation alone provides differing system rankings compared with understandability-biased evaluation measures. In this study, we further explored the development of retrieval methods that combine signals about topical relevance and understandability.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Collection</title>
        <p>In this paper, we investigated methods to estimate Web page understandability, including the effect that HTML preprocessing pipelines and heuristics have, and their search effectiveness when employed within retrieval methods. To obtain both topical relevance and understandability assessments, we used the data from the CLEF 2015 and 2016 eHealth collections. The CLEF eHealth initiative is a research community–shared task aimed at creating resources for evaluating health search engines aimed at the general public [<xref ref-type="bibr" rid="ref43">43</xref>]. Note, in the remainder of this paper, we refer to topical relevance simply as relevance, when this does not cause confusion.</p>
        <p>The CLEF 2015 collection contains 50 queries and 1437 documents that have been assessed as relevant by clinical experts and have an assessment for understandability [<xref ref-type="bibr" rid="ref44">44</xref>]. Documents in this collection are a selected crawl of health websites, of which the majority are certified HON websites. The CLEF 2016 collection contains 300 queries and 3298 relevant documents that also have been assessed with respect to understandability [<xref ref-type="bibr" rid="ref14">14</xref>]. Documents in this collection belong to the ClueWeb12 B13 corpus [<xref ref-type="bibr" rid="ref45">45</xref>], and thus are general English Web pages, not necessarily targeted to health topics nor of a controlled quality (as are the HON certified pages). Understandability assessments were provided on a 5-point Likert scale for CLEF 2015 and on a 0 to 100 range for CLEF 2016 (0 indicates the highest understandability).</p>
        <p>To support the investigation of methods to automatically estimate the understandability of Web pages, we further considered correlations between multiple human assessors (interassessor agreement). For CLEF 2015, we used the publicly available additional assessments made by unpaid medical students and health consumers collected by Palotti et al [<xref ref-type="bibr" rid="ref46">46</xref>] in a study of how medical expertise affects assessments. For CLEF 2016, we collected understandability assessments for 100 documents. In total, 3 members of our research team, who did not author this paper and are not medical experts, were recruited to provide the assessments (the correlation of these additional assessments and CLEF’s ground truth is examined further in this paper). The Relevation tool [<xref ref-type="bibr" rid="ref47">47</xref>] was used to assist with the assessments, mimicking the settings used in CLEF.</p>
      </sec>
      <sec>
        <title>Understandability Estimators</title>
        <p>Several methods have been used to estimate the understandability of health Web pages, with the most popular methods (at least in the biomedical literature) being RF based on surface level characteristics of the text. Next, we outline the categories of methods to estimate understandability used in this study; an overview is shown in <xref ref-type="boxed-text" rid="box1">Textboxes 1</xref> to <xref ref-type="boxed-text" rid="box10">10</xref>.</p>
        <sec>
          <title>Traditional Readability Formulae</title>
          <p>These include the most popular RF [<xref ref-type="bibr" rid="ref25">25</xref>-<xref ref-type="bibr" rid="ref27">27</xref>] as well as other less popular ones [<xref ref-type="bibr" rid="ref48">48</xref>-<xref ref-type="bibr" rid="ref51">51</xref>]. An extensive description of these RF is provided in surveys by Collins-Thompson [<xref ref-type="bibr" rid="ref52">52</xref>] and Dubay [<xref ref-type="bibr" rid="ref28">28</xref>]. A complete list of methods is provided in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref>.</p>
          </sec>
        <sec>
          <title>Raw Components of Readability Formulae</title>
          <p>These are formed by the <italic>building blocks</italic> used in the traditional RF. Examples include the average number of characters per word and the average number of syllables in a sentence. Words are divided into syllables using the Python package Pyphen [<xref ref-type="bibr" rid="ref53">53</xref>]. A complete list of methods is provided in <xref ref-type="boxed-text" rid="box2">Textbox 2</xref>.</p>
          </sec>
        <sec>
          <title>General Medical Vocabularies</title>
          <p>These include methods that count the number of words with a medical prefix or suffix, that is, beginning or ending with Latin or Greek particles (eg, amni-, angi-, algia-, and arteri-), and text strings included in lists of acronyms or in medical vocabularies such as the International Statistical Classification of Diseases and Related Health Problems (ICD), Drugbank and the OpenMedSpel dictionary [<xref ref-type="bibr" rid="ref54">54</xref>]. An acronym list from the ADAM database [<xref ref-type="bibr" rid="ref55">55</xref>] was used. Methods in this category were matched with documents using simple keyword matching. A complete list of methods is provided in <xref ref-type="boxed-text" rid="box3">Textbox 3</xref>.</p>
          </sec>
        <sec>
          <title>Consumer Medical Vocabulary</title>
          <p>The popular MetaMap [<xref ref-type="bibr" rid="ref56">56</xref>] tool was used to map the text content of Web pages to entries in CHV [<xref ref-type="bibr" rid="ref34">34</xref>]. We used the MetaMap semantic types to retain only concepts identified as symptoms or diseases. Similar approaches have been commonly used in the literature [<xref ref-type="bibr" rid="ref57">57</xref>-<xref ref-type="bibr" rid="ref60">60</xref>]. A complete list of methods is provided in <xref ref-type="boxed-text" rid="box4">Textbox 4</xref>.</p>
          </sec>
        <sec>
          <title>Expert Medical Vocabulary</title>
          <p>Similar to the CHV features, we used MetaMap to convert the content of Web pages into MeSH entities, studying symptom and disease concepts separately. A complete list of methods is provided in <xref ref-type="boxed-text" rid="box5">Textbox 5</xref>.</p>
          </sec>
        <sec>
          <title>Natural Language Features</title>
          <p>These included commonly used natural language heuristics such as the ratio of POS classes, the height of the POS parser tree, the number of entities in the text, the sentiment polarity [<xref ref-type="bibr" rid="ref61">61</xref>], and the ratio of words found in English vocabularies. The Python package Natural Language Toolkit [<xref ref-type="bibr" rid="ref62">62</xref>] was used for sentiment analysis, POS tagging, and entity recognition. The GNU Aspell [<xref ref-type="bibr" rid="ref63">63</xref>] dictionary was used as a standard English vocabulary and a stop word list was built by merging those of Indri [<xref ref-type="bibr" rid="ref64">64</xref>] and Terrier [<xref ref-type="bibr" rid="ref65">65</xref>]. Discourse features, such as the distribution of POS classes and density of entity in a text, were previously studied in the task of understandability prediction [<xref ref-type="bibr" rid="ref66">66</xref>] and found superior to complex features such as entity coreference and entity grid [<xref ref-type="bibr" rid="ref67">67</xref>]. To the best of our knowledge, sentiment polarity was never investigated in this task. Our intuition is that the content produced by laypeople in patient forums or blogs (easy to read) is potentially more emotional than scientific publications (hard to read). A complete list of methods is provided in <xref ref-type="boxed-text" rid="box6">Textbox 6</xref>.</p>
          </sec>
        <sec>
          <title>HTML Features</title>
          <p>These include the identification of a large number of HTML tags, which were extracted with the Python library BeautifulSoup [<xref ref-type="bibr" rid="ref68">68</xref>]. The intuition for these features is that Web pages with many images and tables might explain and summarize health content better, thus providing more understandable content to the general public. A complete list of methods is provided in <xref ref-type="boxed-text" rid="box7">Textbox 7</xref>.</p>
          
          <boxed-text id="box1" position="float">
            <title>Readability formulae (RF) used to estimate understandability.</title>
            <p>Readability feature</p>
            <list list-type="bullet">
              <list-item>
                <p>Automated Readability Index [<xref ref-type="bibr" rid="ref48">48</xref>]</p>
              </list-item>
              <list-item>
                <p>Coleman-Liau Index (CLI) [<xref ref-type="bibr" rid="ref25">25</xref>]</p>
              </list-item>
              <list-item>
                <p>Dale-Chall Index (DCI) [<xref ref-type="bibr" rid="ref26">26</xref>]</p>
              </list-item>
              <list-item>
                <p>Flesch-Kincaid Grade Level [<xref ref-type="bibr" rid="ref27">27</xref>]</p>
              </list-item>
              <list-item>
                <p>Flesch Reading Ease (FRE) [<xref ref-type="bibr" rid="ref27">27</xref>]</p>
              </list-item>
              <list-item>
                <p>Gunning Fog Index (GFI) [<xref ref-type="bibr" rid="ref49">49</xref>]</p>
              </list-item>
              <list-item>
                <p>Läsbarhetsindex (LIX) [<xref ref-type="bibr" rid="ref50">50</xref>]</p>
              </list-item>
              <list-item>
                <p>Simple Measure of Gobbledygook (SMOG) [<xref ref-type="bibr" rid="ref51">51</xref>]</p>
              </list-item>
            </list>
          </boxed-text>
        
          
          <boxed-text id="box2" position="float">
            <title>Raw components of readability formulae (CRF) used to estimate understandability. For all features, raw values, values normalized by number of words in a document, and values normalized by number of sentences in a document were used.</title>
            <p>Components of readability feature</p>
            <list list-type="bullet">
              <list-item>
                <p># of Characters</p>
              </list-item>
              <list-item>
                <p># of Words</p>
              </list-item>
              <list-item>
                <p># of Sentences</p>
              </list-item>
              <list-item>
                <p># of Difficult Words (Dale-Chall list [<xref ref-type="bibr" rid="ref26">26</xref>])</p>
              </list-item>
              <list-item>
                <p># of Words Longer than 4 Characters</p>
              </list-item>
              <list-item>
                <p># of Words Longer than 6 Characters</p>
              </list-item>
              <list-item>
                <p># of Words Longer than 10 Characters</p>
              </list-item>
              <list-item>
                <p># of Words Longer than 13 Characters</p>
              </list-item>
              <list-item>
                <p># of Syllables</p>
              </list-item>
              <list-item>
                <p># of Polysyllable Words (&gt;3 Syllables)</p>
              </list-item>
            </list>
          </boxed-text>
        
          
          <boxed-text id="box3" position="float">
            <title>General medical vocabulary features used to estimate understandability. For all features, raw values, values normalized by number of words in a document, and values normalized by number of sentences in a document were used.</title>
            <p>General medical vocabularies (GMVs)</p>
            <list list-type="bullet">
              <list-item>
                <p># of words with medical prefix</p>
              </list-item>
              <list-item>
                <p># of words with medical suffix</p>
              </list-item>
              <list-item>
                <p># of acronyms</p>
              </list-item>
              <list-item>
                <p># of International Statistical Classification of Diseases and Related Health Problems (ICD) concepts</p>
              </list-item>
              <list-item>
                <p># of Drugbank concepts</p>
              </list-item>
              <list-item>
                <p># of words in medical dictionary (OpenMedSpel)</p>
              </list-item>
            </list>
          </boxed-text>
        
          
          <boxed-text id="box4" position="float">
            <title>Consumer medical vocabulary features used to estimate understandability. For all features, raw values, values normalized by number of words in a document, and values normalized by number of sentences in a document were used.</title>
            <p>Consumer medical vocabularies (CMV)</p>
            <list list-type="bullet">
              <list-item>
                <p>Consumer health vocabulary (CHV) mean score for all concepts</p>
              </list-item>
              <list-item>
                <p># of CHV concepts</p>
              </list-item>
              <list-item>
                <p>CHV mean score for symptom concepts</p>
              </list-item>
              <list-item>
                <p># of CHV symptom concepts</p>
              </list-item>
              <list-item>
                <p>CHV mean score for disease concepts</p>
              </list-item>
              <list-item>
                <p># of CHV disease concepts</p>
              </list-item>
            </list>
          </boxed-text>
        
          
          <boxed-text id="box5" position="float">
            <title>Expert medical vocabulary features used to estimate understandability. For all features, raw values, values normalized by number of words in a document, and values normalized by number of sentences in a document were used.</title>
            <p>Expert medical vocabulary (EMV)</p>
            <list list-type="bullet">
              <list-item>
                <p># of Medical Subject Headings (MeSH) concepts</p>
              </list-item>
              <list-item>
                <p>Average tree of MeSH concepts</p>
              </list-item>
              <list-item>
                <p># of MeSH symptom concepts</p>
              </list-item>
              <list-item>
                <p>Average tree of MeSH symptom concepts</p>
              </list-item>
              <list-item>
                <p># of MeSH disease concepts</p>
              </list-item>
              <list-item>
                <p>Average tree of MeSH disease concepts</p>
              </list-item>
            </list>
          </boxed-text>
        
          
          
          <boxed-text id="box6" position="float">
            <title>Natural language features used to estimate understandability. For all features, raw values, values normalized by number of words in a document, and values normalized by number of sentences in a document were used.</title>
            <p>Natural language features (NLF)</p>
            <list list-type="bullet">
              <list-item>
                <p>Positive words</p>
              </list-item>
              <list-item>
                <p>Negative words</p>
              </list-item>
              <list-item>
                <p>Neutral words</p>
              </list-item>
              <list-item>
                <p># of verbs</p>
              </list-item>
              <list-item>
                <p># of nouns</p>
              </list-item>
              <list-item>
                <p># of pronouns</p>
              </list-item>
              <list-item>
                <p># of adjectives</p>
              </list-item>
              <list-item>
                <p># of adverbs</p>
              </list-item>
              <list-item>
                <p># of adpositions</p>
              </list-item>
              <list-item>
                <p># of conjunctions</p>
              </list-item>
              <list-item>
                <p># of determiners</p>
              </list-item>
              <list-item>
                <p># of cardinal numbers</p>
              </list-item>
              <list-item>
                <p># of particles or other function words</p>
              </list-item>
              <list-item>
                <p># of other part of speech (POS; foreign words and typos)</p>
              </list-item>
              <list-item>
                <p># of punctuation</p>
              </list-item>
              <list-item>
                <p># of entities</p>
              </list-item>
              <list-item>
                <p>Height of POS parser tree</p>
              </list-item>
              <list-item>
                <p># of stop words</p>
              </list-item>
              <list-item>
                <p># of words not found in Aspell English dictionary</p>
              </list-item>
              <list-item>
                <p>Average tree of Medical Subject Headings (MeSH) disease concepts</p>
              </list-item>
            </list>
          </boxed-text>
        
        </sec>
        <sec>
          <title>Word Frequency Features</title>
          <p>Generally speaking, common and known words are usually frequent words, whereas unknown and obscure words are generally rare. This idea is implemented in RF such as the DCI, which uses a list of common words and counts the number of words that fall outside this list (complex words) [<xref ref-type="bibr" rid="ref26">26</xref>] and has shown success in other recent approaches [<xref ref-type="bibr" rid="ref69">69</xref>,<xref ref-type="bibr" rid="ref70">70</xref>]. We extended these observations by studying corpus-wide word frequencies. In total, 3 corpora were analyzed to extract word frequencies:</p>
          <list list-type="bullet">
            <list-item>
              <p>Medical Reddit: Reddit [<xref ref-type="bibr" rid="ref71">71</xref>] is a Web forum with a sizeable user community, which is responsible for generating and moderating its content. This forum is intensively used for health purposes, for example, in the Reddit community AskDocs [<xref ref-type="bibr" rid="ref72">72</xref>], licensed nurses and doctors (subject to user identity verification) advise help seekers free of charge. We selected 6 such communities (medical, AskDocs, AskDoctorSmeeee, Health, WomensHealth, and Mens_Health) and downloaded all user interactions available until September 1, 2017, using the Python library Python Reddit Wrapper PRAW [<xref ref-type="bibr" rid="ref73">73</xref>]. In total, 43,019 discussions were collected.</p>
            </list-item>
            <list-item>
              <p>Medical English Wikipedia: after obtaining a recent Wikipedia dump [<xref ref-type="bibr" rid="ref74">74</xref>] (May 1, 2017), we filtered papers to only those containing an Infobox in which at least one of the following words appeared as a property: ICD10, ICD9, DiseasesDB, MeSH, MeSHID, MeshName, MeshNumber, GeneReviewsName, Orphanet, eMedicine, MedlinePlus, drug_name, Drugs.com, DailyMedID, and LOINC. A Wikipedia infobox is a structured template that appears on the right of Wikipedia pages summarizing key aspects of papers. This process followed the method by Soldaini et al [<xref ref-type="bibr" rid="ref75">75</xref>], which favors precision over recall when identifying a health-related paper. This resulted in a collection of 11,868 papers.</p>
            </list-item>
            <list-item>
              <p>PubMed Central: PubMed Central is a Web-based database of biomedical literature. We used the collection distributed for the Text Retrieval Conference (TREC) 2014 and 2015 Clinical Decision Support Track [<xref ref-type="bibr" rid="ref76">76</xref>,<xref ref-type="bibr" rid="ref77">77</xref>], consisting of 733,191 papers.</p>
            </list-item>
          </list>
          
          <boxed-text id="box7" position="float">
            <title>HTML features used to estimate understandability.</title>
            <p>HTML features (HF)</p>
            <list list-type="bullet">
              <list-item>
                <p># of abbreviation (abbr tags)</p>
              </list-item>
              <list-item>
                <p># of links (A tags)</p>
              </list-item>
              <list-item>
                <p># of blockquote tags</p>
              </list-item>
              <list-item>
                <p># of bold tags</p>
              </list-item>
              <list-item>
                <p># of cite tags</p>
              </list-item>
              <list-item>
                <p># of divisions or sections (div tags)</p>
              </list-item>
              <list-item>
                <p># of forms tags</p>
              </list-item>
              <list-item>
                <p># of heading H1 tags</p>
              </list-item>
              <list-item>
                <p># of heading H2 tags</p>
              </list-item>
              <list-item>
                <p># of heading H3 tags</p>
              </list-item>
              <list-item>
                <p># of heading H4 tags</p>
              </list-item>
              <list-item>
                <p># of heading H5 tags</p>
              </list-item>
              <list-item>
                <p># of heading H6 tags</p>
              </list-item>
              <list-item>
                <p>Total # of headings (any heading H above)</p>
              </list-item>
              <list-item>
                <p># of image tags</p>
              </list-item>
              <list-item>
                <p># of input tags</p>
              </list-item>
              <list-item>
                <p># of link tags</p>
              </list-item>
              <list-item>
                <p># of description lists (DL tags)</p>
              </list-item>
              <list-item>
                <p># of unordered lists (UL tags)</p>
              </list-item>
              <list-item>
                <p># of ordered lists (OL tags)</p>
              </list-item>
              <list-item>
                <p>Total # of any list (DL+UL+OL)</p>
              </list-item>
              <list-item>
                <p># of short quotations (Q tags)</p>
              </list-item>
              <list-item>
                <p># of scripts tags</p>
              </list-item>
              <list-item>
                <p># of spans tags</p>
              </list-item>
              <list-item>
                <p># of table tags</p>
              </list-item>
              <list-item>
                <p># of paragraphs (P tags)</p>
              </list-item>
            </list>
          </boxed-text>
        
          <p>A summary of the statistics of the corpora is reported in <xref ref-type="table" rid="table1">Table 1</xref>. We modeled word frequencies in a corpus in a straightforward manner: we sorted the word frequencies and normalized word rankings such that values close to 100 are attributed to common words and values close to 0 to rare words. Thereafter, we replaced each word in a document by a number ranging from 0 to 100, which represents the frequency of that word in the corpus. Finally, we extracted features based on the word frequency distribution for that document. For example, the feature <italic>75th percentile English Wikipedia</italic> is a number between 0 and 100 representing how frequent the word at the 75th percentile of a document is, with word frequencies extracted from the English Wikipedia corpus. Unless explicitly stated otherwise, we ignored out-of-vocabulary (OV) words in the corpus. A complete list of methods is provided in <xref ref-type="boxed-text" rid="box8">Textbox 8</xref>.</p>
          </sec>
        <sec>
          <title>Machine Learning on Text-Regressors and Classifiers</title>
          <p>These include machine learning methods for estimating Web page understandability. Although Collins-Thompson highlighted the promise of estimating understandability using machine learning methods, a challenge is identifying the background corpus to be used for training [<xref ref-type="bibr" rid="ref52">52</xref>]. To this aim, we used the 3 corpora detailed above, and assumed understandability labels according to the expected difficulty of documents in these collections:</p>
          <list list-type="bullet">
            <list-item>
              <p>Medical Reddit (label 1): Documents in this corpus are expected to be written in a colloquial style, and thus the easiest to understand. All the conversations are, in fact, explicitly directed to assist inexpert health consumers</p>
            </list-item>
            <list-item>
              <p>Medical English Wikipedia (label 2): Documents in this corpus are expected to be less formal than scientific papers, but more formal than a Web forum like Reddit, thus somewhat more difficult to understand</p>
            </list-item>
            <list-item>
              <p>PubMed Central (label 3): Documents in this corpus are expected to be written in a highly formal style, as the target audience are physicians and biomedical researchers.</p>
            </list-item>
          </list>
          
          <table-wrap position="float" id="table1">
            <label>Table 1</label>
            <caption>
              <p>Statistics for the corpora used as background models for understandability estimations.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="460"/>
              <col width="180"/>
              <col width="180"/>
              <col width="180"/>
              <thead>
                <tr valign="top">
                  <td>Statistics</td>
                  <td>Medical Wikipedia</td>
                  <td>Medical Reddit</td>
                  <td>PubMed Central</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Documents, n</td>
                  <td>11,868</td>
                  <td>43,019</td>
                  <td>733,191</td>
                </tr>
                <tr valign="top">
                  <td>Words, n</td>
                  <td>10,655,572</td>
                  <td>11,978,447</td>
                  <td>144,024,976</td>
                </tr>
                <tr valign="top">
                  <td>Unique words, n</td>
                  <td>467,650</td>
                  <td>317,106</td>
                  <td>2,933,167</td>
                </tr>
                <tr valign="top">
                  <td>Average words per document, mean (SD)</td>
                  <td>898.90 (1351.76)</td>
                  <td>278.45 (359.70)</td>
                  <td>227.22 (270.44)</td>
                </tr>
                <tr valign="top">
                  <td>Average characters per document, mean (SD)</td>
                  <td>5107.81 (7618.57)</td>
                  <td>1258.44 (1659.96)</td>
                  <td>1309.11 (1447.31)</td>
                </tr>
                <tr valign="top">
                  <td>Average characters per word, mean (SD)</td>
                  <td>5.68 (3.75)</td>
                  <td>4.52 (3.52)</td>
                  <td>5.76 (3.51)</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          
          <boxed-text id="box8" position="float">
            <title>Word frequency features used to estimate understandability.</title>
            <p>Word frequency features (WFF)</p>
            <list list-type="bullet">
              <list-item>
                <p>25th percentile English Wikipedia</p>
              </list-item>
              <list-item>
                <p>50th percentile English Wikipedia</p>
              </list-item>
              <list-item>
                <p>75th percentile English Wikipedia</p>
              </list-item>
              <list-item>
                <p>Mean rank English Wikipedia</p>
              </list-item>
              <list-item>
                <p>Mean rank English Wikipedia—includes out-of-vocabulary (OV) words</p>
              </list-item>
              <list-item>
                <p>25th percentile Medical Reddit</p>
              </list-item>
              <list-item>
                <p>50th percentile Medical Reddit</p>
              </list-item>
              <list-item>
                <p>75th percentile Medical Reddit</p>
              </list-item>
              <list-item>
                <p>Mean rank Medical Reddit</p>
              </list-item>
              <list-item>
                <p>Mean rank Medical Reddit—includes OV</p>
              </list-item>
              <list-item>
                <p>25th percentile Pubmed</p>
              </list-item>
              <list-item>
                <p>50th percentile Pubmed</p>
              </list-item>
              <list-item>
                <p>75th percentile Pubmed</p>
              </list-item>
              <list-item>
                <p>Mean rank Pubmed</p>
              </list-item>
              <list-item>
                <p>Mean rank Pubmed—includes OV</p>
              </list-item>
              <list-item>
                <p>25th percentile Wikipedia+Reddit+Pubmed</p>
              </list-item>
              <list-item>
                <p>50th percentile Wikipedia+Reddit+Pubmed</p>
              </list-item>
              <list-item>
                <p>75th percentile Wikipedia+Reddit+Pubmed</p>
              </list-item>
              <list-item>
                <p>Mean rank Wikipedia+Reddit+Pubmed</p>
              </list-item>
              <list-item>
                <p>Mean rank Wikipedia+Reddit+Pubmed—includes OV</p>
              </list-item>
            </list>
          </boxed-text>
        
          <boxed-text id="box9" position="float">
            <title>Machine learning regressor features used to estimate understandability.</title>
            <p>Machine learning regressors (MLR)</p>
            <list list-type="bullet">
              <list-item>
                <p>Linear regressor</p>
              </list-item>
              <list-item>
                <p>Multilayer perceptron regressor</p>
              </list-item>
              <list-item>
                <p>Random forest regressor</p>
              </list-item>
              <list-item>
                <p>Support vector machine regressor</p>
              </list-item>
              <list-item>
                <p>eXtreme Gradient Boosting Regressor</p>
              </list-item>
            </list>
          </boxed-text>
          <boxed-text id="box10" position="float">
            <title>Machine learning classifier features used to estimate understandability.</title>
            <p>Machine learning classifiers (MLC)</p>
            <list list-type="bullet">
              <list-item>
                <p>Logistic regression</p>
              </list-item>
              <list-item>
                <p>Multilayer perceptron classifier</p>
              </list-item>
              <list-item>
                <p>Random forest classifier</p>
              </list-item>
              <list-item>
                <p>Support vector machine classifier</p>
              </list-item>
              <list-item>
                <p>Multinomial naive Bayes</p>
              </list-item>
              <list-item>
                <p>eXtreme Gradient Boosting Classifier</p>
              </list-item>
            </list>
          </boxed-text>
          <p>On the basis of the labels of each class above, models were learnt using all documents from these corpora after features were extracted using latent semantic analysis with ten dimensions on top of TF-IDF calculated for each word. We modeled a classification task as well as a regression task using these corpora. In the classification task, the first step is to train a classifier on documents belonging to these three collections with the three different classes shown above. The second step is to use the classifier to estimate which of these three possible classes an unseen document from the CLEF 2015 or CLEF 2016 collections would belong to. Similarly, in the regression task, after training, a regressor has to estimate an understandability value to an unseen CLEF document. We hypothesize that documents that are more difficult to read are more similar to PubMed documents than to Wikipedia or Reddit ones. A complete list of methods is provided in <xref ref-type="boxed-text" rid="box9">Textboxes 9</xref> and <xref ref-type="boxed-text" rid="box10">10</xref>.</p>
        </sec>
      </sec>
      <sec>
        <title>Preprocessing Pipelines and Heuristics</title>
        <p>As part of our study, we investigated the influence that the preprocessing of Web pages had on the estimation of understandability computed using the methods described above. We did so by comparing the combination of a number of preprocessing pipelines, heuristics, and understandability estimation methods with human assessments of Web page understandability. Our experiments extended our previous work [<xref ref-type="bibr" rid="ref20">20</xref>] and provided a much more thorough analysis, as that previous work only evaluated surface level RF and did not compare its results against human assessments.</p>
        <p>To extract the content of a Web page from the HTML source we tested: BeautifulSoup, <italic>Naive</italic> [<xref ref-type="bibr" rid="ref68">68</xref>], which just naively removes HTML tags; and Boilerpipe, <italic>Boi</italic> [<xref ref-type="bibr" rid="ref78">78</xref>], and Justext, <italic>Jst</italic> [<xref ref-type="bibr" rid="ref79">79</xref>], which eliminate boilerplate text together with HTML tags. Our data analysis in Palotti et al [<xref ref-type="bibr" rid="ref20">20</xref>] highlighted that the text in HTML fields such as titles, menus, tables, and lists often missed a correct punctuation mark, and thus, the text extracted from them could be interpreted as many short sentences or few very long sentences, depending on whether a period was forced at the end of fields or sentences. We, thus, implemented the same 2 heuristics devised to deal with this: <italic>ForcePeriod (FP)</italic> and <italic>DoNotForcePeriod (DNFP)</italic>. If a punctuation mark is found at the end of a field or sentence, it is kept as it is. However, if no punctuation mark is found at the end of a field or sentence, the FP heuristic forces the insertion of a period at the end of that extracted HTML field, whereas the DNFP does not.</p>
      </sec>
      <sec>
        <title>Integrating Understandability into Retrieval</title>
        <p>We then investigated how understandability estimations can be integrated into retrieval methods to increase the quality of search results. Specifically, we considered 3 retrieval methods of differing quality for the initial retrieval. These included the best 2 runs submitted to each CLEF task, and a plain BM25 baseline (default Terrier parameters: b=0.75 and k<sub>1</sub>=1.2). BM25 is a probabilistic term weighting scheme commonly used in information retrieval and is defined with respect to the frequency of a term in a document, the collection frequency of that term, and the ratio between the length of the document and the average document length. As understandability estimators, we used the XGB regressor [<xref ref-type="bibr" rid="ref80">80</xref>] as well as Simple Measure of Gobbledygook (SMOG) for CLEF 2015 and DCI for CLEF 2016. These were selected as they were the best performing RF and machine learning methods for each collection (details on the evaluation of understandability estimators presented in the Results section). Remember that, as described in the <italic>Related Work</italic> section, RF are a specific approach to estimate understandability. Note that in XGB, for assessed documents we used 10-fold cross validation, training XGB on 90% of the data, and used its predictions for the remaining 10%. For unassessed documents, we trained XGB on all assessed data and applied this model to generate predictions. Different machine learning methods and feature selection schemes were experimented with; results are available in the <xref ref-type="app" rid="app1">Multimedia Appendix 1</xref>. XGB was selected because its results were the best among the machine learning test (which include all machine learning methods listed in <xref ref-type="boxed-text" rid="box9">Textboxes 9</xref> and <xref ref-type="boxed-text" rid="box10">10</xref>).</p>
        <p>To integrate understandability estimators into the retrieval process, we first investigated <italic>reranking</italic> search results retrieved by the initial runs purely based on the understandability estimations. If all the search results from a run were to be considered, then such a reranking method might place at early ranks Web pages highly likely to be understandable, but possibly less likely to be topically relevant. To balance relevance and understandability, we only reranked the first <italic>k</italic> documents. We explored rank cut-offs k=15,20,50. As evaluation was performed with respect to the first n=10 rank positions, the setting k=15 provided a conservative reranking of search results, whereas, k=50 provided a less conservative reranking approach.</p>
        
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Learning to rank settings.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="70"/>
            <col width="570"/>
            <col width="180"/>
            <col width="180"/>
            <thead>
              <tr valign="top">
                <td rowspan="2">Strategy</td>
                <td rowspan="2">Explanation</td>
                <td colspan="2">Labeling function</td>
              </tr>
              <tr valign="bottom">
                <td>CLEF<sup>a</sup> 2015</td>
                <td>CLEF 2016</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="bottom">
                <td>LTR<sup>b</sup> 1</td>
                <td>Model built <italic>only</italic> on the topicality labels with IR<sup>c</sup> features</td>
                <td>F<sup>d</sup>(R<sup>e</sup>,U<sup>f</sup>)=R</td>
                <td>F(R,U)=R</td>
              </tr>
              <tr valign="top">
                <td>LTR 2</td>
                <td>Model built <italic>only</italic> on the topicality labels with IR and understandability features</td>
                <td>F(R,U)=R</td>
                <td>F(R,U)=R</td>
              </tr>
              <tr valign="top">
                <td>LTR 3</td>
                <td>Model combines understandability and topicality labels. Uses IR and understandability features</td>
                <td>F(R,U)=R×U/3</td>
                <td>F(R,U)=R×(100-U)/100</td>
              </tr>
              <tr valign="top">
                <td>LTR 4</td>
                <td>Model built <italic>only</italic> on easy-to-read documents. Uses IR and understandability Features</td>
                <td>F(R,U)=R, if U≥2 <break/>F(R,U)=0, otherwise</td>
                <td>F(R,U)=R, if U≤40 <break/>F(R,U)=0, otherwise</td>
              </tr>
              <tr valign="top">
                <td>LTR 5</td>
                <td>Model built boosting easy-to-read documents. Uses IR and understandability Features</td>
                <td>F(R,U)=2×R, if U≥2 <break/>F(R,U)=R, otherwise</td>
                <td>F(R,U)=2×R, if U≤40 <break/>F(R,U)=R, otherwise</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>CLEF: Conference and Labs of the Evaluation Forum.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>LTR: learning to rank.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>IR: information retrieval.</p>
            </fn>
            <fn id="table2fn4">
              <p><sup>d</sup>F: function.</p>
            </fn>
            <fn id="table2fn5">
              <p><sup>e</sup>R: topical relevance of a document.</p>
            </fn>
            <fn id="table2fn6">
              <p><sup>f</sup>U: understandability.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        
        <p>As an alternative to the previous 2-step ranking strategy for combining topical relevance and understandability, we explored the <italic>fusion</italic> of 2 search result lists separately obtained for relevance and understandability. For this, we used the reciprocal rank fusion method [<xref ref-type="bibr" rid="ref81">81</xref>], which was shown effective for combining 2 lists of search results based on their documents’ <italic>ranks</italic>, rather than scores. This approach was selected over score-based fusion methods because the distribution of relevance scores for the retrieved documents differed considerably (both in magnitude and spread) from that of understandability scores: in such a case, score-based fusion is not appropriate. For relevance, we used, separately, the 3 retrieval methods for each collection. For CLEF 2015, we used BM25 and the submissions made by the East China Normal University (ECNU) team [<xref ref-type="bibr" rid="ref82">82</xref>] and the Korean Institute of Science and Technology Information (KISTI) team [<xref ref-type="bibr" rid="ref83">83</xref>]. For CLEF 2016, we also used BM25 and the submissions made by the Georgetown University Information Retrieval (GUIR) team [<xref ref-type="bibr" rid="ref84">84</xref>] and ECNU [<xref ref-type="bibr" rid="ref85">85</xref>]. For understandability, we used, separately, the estimations from SMOG or DCI and XGB. Moreover, for this approach, we studied limiting the ranking of results to be considered by the methods across the cut-offs k=15, 20, 50.</p>
        <p>Finally, we considered a third alternative to combine relevance and understandability: using <italic>learning to rank</italic> with features derived from retrieval methods (information retrieval (IR) features) and understandability estimators. Learning to rank refers to a family of machine learning methods where ranking models are learnt from training data (and associated features). With the CLEF 2015 and 2016 collections, we explored 5 combinations of label attribution and feature sets, maintaining the same pairwise learning to rank algorithm based on tree boosting (XGB). These combinations are listed in <xref ref-type="table" rid="table2">Table 2</xref>, with R being the relevance of documents and U their understandability estimation. Although the definitions of learning to rank (LTR) 1 and LTR 2 are straightforward, the other methods deserve some further explanation. In LTR 3, a penalty was proportionally assigned to documents according to their understandability score U. For example, for CLEF 2016, a document with understandability U=0 received no penalty, as 0 was the easiest level of understanding, whereas another with understandability 50 received a 50% penalty, meaning that its relevance score was halved. LTRs 4 and 5 were based on a fixed threshold applied to the understandability score: if the score met the easy-to-read threshold condition (U≥2 for CLEF 2015 and U≤40 for CLEF 2016), then the original relevance score (for LTR 4) or a boosted value (for LTR 5) was assigned to the corresponding document. We used the thresholds U=2 for CLEF 2015 and U=40 for CLEF 2016, based on the distribution of understandability assessments and the semantics of understandability labels [<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref14">14</xref>].</p>
        </sec>
      <sec>
        <title>Evaluation Measures</title>
        <p>In the experiments, we used Pearson, Kendall, and Spearman correlations to compare the understandability assessments of human assessors with estimations obtained by the considered automated approaches, under all combinations of pipelines and heuristics. Pearson correlation is used to calculate the strength of the linear relationship between 2 variables, whereas Kendall and Spearman measure the rank correlations among the variables. We opted to report all 3 correlation coefficients to allow for a thorough comparison with other work, as they are equally used in the literature.</p>
        <p>For the retrieval experiments, we used evaluation measures that rely on both (topical) relevance and understandability. The uRBP measure [<xref ref-type="bibr" rid="ref42">42</xref>] extends rank biased precision (RBP) to situations where multiple relevance dimensions are used. The measure is formulated as uRBP(p)=(1-p) × ∑<sup>k</sup> p<sup>k-1</sup> × r(d@k) × u(d@k), where r(d@k) is the gain for retrieving a relevant document at rank k and u(d@k) is the gain for retrieving a document of a certain understandability at rank k; p is the RBP persistence parameter. This measure was an official evaluation measure used in CLEF (we also set <italic>p</italic>=.8).</p>
        
        
      
        <p>A drawback of uRBP is that relevance and understandability are combined into a unique evaluation score, thus making it difficult to interpret whether improvements are because of more understandable or more topical documents being retrieved. To overcome this, we used the multidimensional metric (MM) framework introduced by Palotti et al [<xref ref-type="bibr" rid="ref86">86</xref>], which first separately calculates an RBP value for relevance and another for understandability, and then combines them into a unique effectiveness measure:</p>
        <list list-type="bullet">
          <list-item>
            <p><italic>RBP</italic><sub><italic>r</italic> </sub><italic>@n(p)</italic>: uses the relevance assessments for the top <italic>n</italic> search results (ie, this is the common RBP). We regarded a document as topically relevant if assessed as somewhat relevant or highly relevant.</p>
          </list-item>
          <list-item>
            <p><italic>RBP</italic><sub><italic>u</italic> </sub><italic>@n(p)</italic>: uses the understandability assessments for the top <italic>n</italic> search results. We regarded a document as understandable (1) for CLEF 2015 if assessed easy or somewhat easy to understand and (2) for CLEF 2016 if its assessed understandability score was smaller than a threshold U. We used U=40, based on the distribution of understandability assessments. Assessors were presented with a slider for understandability assessment and U=50 was labeled as average understandability. This created a bimodal distribution of understandability assessments, with U=40 being a good upper limit for easy-to-read documents. The understandability distribution can be found in the <xref ref-type="app" rid="app2">Multimedia Appendix 2</xref>.</p>
          </list-item>
        </list>
        <list list-type="bullet">
          <list-item>
            <p><italic>MM</italic><sub><italic>RBP</italic> </sub><italic>@n(p)=2×(RBP</italic><sub><italic>r</italic> </sub><italic>@n×RBP</italic><sub><italic>u</italic> </sub><italic>@n)/(RBP</italic><sub><italic>r</italic> </sub><italic>@n+RBP</italic><sub><italic>u</italic> </sub><italic>@n)</italic>: combines the previous 2 RBP values into a unique measurement using the harmonic mean (in the same fashion that the F<sub>1</sub> measure combines recall and precision).</p>
          </list-item>
        </list>
        <p>For all measures, we set n=10 because shallow pools were used in CLEF along with measures that focused on the top 10 search results (including RBP<sub>r</sub>@10). Shallow pools refer to the selection of a limited number of documents to be assessed for relevance, among the documents retrieved at the top ranks by a search engine.</p>
        <p>Along with these measures of search effectiveness, we also recorded the number of unassessed documents, the RBP residuals, RBP<sup>*</sup><sub>r</sub>@10, RBP<sup>*</sup><sub>u</sub>@10, and MM<sup>*</sup><sub>RBP</sub>, that is, the corresponding measures calculated by ignoring unassessed documents. These latter measures implement the condensed measures approach proposed by Sakai as a way to deal with unassessed documents [<xref ref-type="bibr" rid="ref87">87</xref>]. We did this to minimize pool bias as the pools built in CLEF were of limited size and the investigated methods retrieved a substantial number of unassessed documents. Pool bias refers to the possible bias in the evaluation toward systems that have contributed documents to the assessment pool: these erroneously receive higher evaluation scores compared with systems that did not contribute to the pool (ie, that were not sampled to create the set of documents to be judged for relevance).</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Evaluation of Understandability Estimators</title>
        <p>To keep this paper succinct, in the following we only report a subset of the results. The remaining results (which show similar trends to those reported here) are made available in the <xref ref-type="app" rid="app3">Multimedia Appendix 3</xref> material for completeness.</p>
        <p>Using the CLEF eHealth 2015 and 2016 collections, we studied the correlations of methods to estimate Web page understandability compared with human assessments. For each category of understandability estimation, <xref ref-type="table" rid="table3">Tables 3</xref> and <xref ref-type="table" rid="table4">4</xref> report the methods with highest Pearson and Spearman or Kendall correlations for CLEF 2015 and 2016, respectively. For each method, we used the best preprocessing settings; a study of the impact of preprocessing is reported in the next subsection.</p>
        <p>Overall, Spearman and Kendall correlations obtained similar results (in terms of which methods exhibited the highest correlations): this was expected as, unlike Pearson, they are both rank-based correlations.</p>
        <p>For traditional RF, SMOG had the highest correlations for CLEF 2015 and DCI for CLEF 2016, regardless of correlation measure. These results resonate with those obtained for the category of raw components of readability formulae (CRF). In fact, the polysyllable words measure, which is the main feature used in SMOG, had the highest correlation for CLEF 2015 among methods in this category. Similarly, the number of difficult words, which is the main feature used in DCI, had the highest correlation for CLEF 2016 among methods in this category.</p>
        <p>When examining the expert vocabulary category (EMV), we found that the number of MeSH concepts obtained the highest correlations with human assessments; however, its correlations were substantially lower than those achieved by the best method from the consumer medical vocabularies category (CMV), that is, the scores of CHV concepts. For the natural language category (NLF), we found that the number of pronouns, the number of stop words, and the number of OV words had the highest correlations—and these were even higher than those obtained with MeSH- and CHV-based methods. In turn, the methods that obtained the highest correlations among the HTML category (HF)—the counts of P tags and list tags—exhibited overall the lowest correlations compared with methods in the other categories. P tags are used to create paragraphs in a Web page, being thus a rough proxy for text length. Among methods in the word frequency category (WFF), the use of Medical Reddit (but also of PubMed) showed the highest correlations, and these were comparable with those obtained by the RF.</p>
        <p>Finally, regressors (MLR) and classifiers (MLC) exhibited the highest correlations among all methods: in this category, the XGB regressor and the multinomial Naive Bayes best correlated with human assessments.</p>
        
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Methods with the highest correlation per category for Conference and Labs of the Evaluation Forum (CLEF) 2015.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="250"/>
            <col width="380"/>
            <col width="150"/>
            <col width="70"/>
            <col width="80"/>
            <col width="70"/>
            <thead>
              <tr valign="top">
                <td>Category</td>
                <td>Method</td>
                <td>Preprocessing</td>
                <td>Pearson</td>
                <td>Spearman</td>
                <td>Kendall</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Readability formulae</td>
                <td>Simple Measure of Gobbledygook Index</td>
                <td>Jst Do Not Force Period (DNFP)</td>
                <td><italic>.438</italic><sup>a</sup></td>
                <td><italic>.388</italic></td>
                <td><italic>.286</italic></td>
              </tr>
              <tr valign="top">
                <td>Components of readability formulae (CRF)</td>
                <td>Average number of polysyllable words per sentence</td>
                <td>Jst force period (FP)</td>
                <td><italic>.429</italic></td>
                <td>.364</td>
                <td>.268</td>
              </tr>
              <tr valign="top">
                <td>CRF</td>
                <td>Average number of polysyllable words per sentence</td>
                <td>Jst DNFP</td>
                <td>.192</td>
                <td><italic>.388</italic></td>
                <td><italic>.286</italic></td>
              </tr>
              <tr valign="top">
                <td>General medical vocabularies (GMVs)</td>
                <td>Average number of medical prefixes per word</td>
                <td>Naïve FP</td>
                <td><italic>.314</italic></td>
                <td>.312</td>
                <td>.229</td>
              </tr>
              <tr valign="top">
                <td>GMVs</td>
                <td>Number of medical prefixes</td>
                <td>Naïve FP</td>
                <td>.131</td>
                <td><italic>.368</italic></td>
                <td><italic>.272</italic></td>
              </tr>
              <tr valign="top">
                <td>Consumer medical vocabulary (CMV)</td>
                <td>Consumer health vocabulary (CHV) mean score for all concepts</td>
                <td>Naïve FP</td>
                <td><italic>.371</italic></td>
                <td><italic>.314</italic></td>
                <td><italic>.228</italic></td>
              </tr>
              <tr valign="top">
                <td>Expert medical vocabulary (EMV)</td>
                <td>Number of medical concepts</td>
                <td>Naïve FP</td>
                <td><italic>.227</italic></td>
                <td><italic>.249</italic></td>
                <td><italic>.178</italic></td>
              </tr>
              <tr valign="top">
                <td>Natural language features (NLF)</td>
                <td>Number of words not found in Aspell dictionary</td>
                <td>Jst DNFP</td>
                <td><italic>.351</italic></td>
                <td>.276</td>
                <td>.203</td>
              </tr>
              <tr valign="top">
                <td>NLF</td>
                <td>Number of pronouns per word</td>
                <td>Naïve FP</td>
                <td>.271</td>
                <td><italic>.441</italic></td>
                <td><italic>.325</italic></td>
              </tr>
              <tr valign="top">
                <td>HTML features (HF)</td>
                <td>Number of P tags</td>
                <td>None</td>
                <td><italic>.219</italic></td>
                <td><italic>.196</italic></td>
                <td><italic>.142</italic></td>
              </tr>
              <tr valign="top">
                <td>Word frequency features (WFF)</td>
                <td>Mean rank Medical Reddit</td>
                <td>Jst DNFP</td>
                <td><italic>.435</italic></td>
                <td>.277</td>
                <td>.197</td>
              </tr>
              <tr valign="top">
                <td>WFF</td>
                <td>25th percentile Pubmed</td>
                <td>Jst DNFP</td>
                <td>.330</td>
                <td><italic>.347</italic></td>
                <td><italic>.256</italic></td>
              </tr>
              <tr valign="top">
                <td>Machine learning regressors (MLR)</td>
                <td>eXtreme Gradient Boosting (XGB) Regressor</td>
                <td>Boi DNFP</td>
                <td><italic>.602</italic></td>
                <td>.394</td>
                <td>.287</td>
              </tr>
              <tr valign="top">
                <td>MLR</td>
                <td>XGB Regressor</td>
                <td>Jst FP</td>
                <td>.565</td>
                <td><italic>.438</italic></td>
                <td><italic>.324</italic></td>
              </tr>
              <tr valign="top">
                <td>Machine learning classifiers</td>
                <td>Multinomial Naïve Bayes</td>
                <td>Naïve FP</td>
                <td><italic>.573</italic></td>
                <td><italic>.477</italic></td>
                <td><italic>.416</italic></td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>Italics used to highlight the best result of each group.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Methods with the highest correlation per category for Conference and Labs of the Evaluation Forum (CLEF) 2016.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="250"/>
            <col width="380"/>
            <col width="150"/>
            <col width="70"/>
            <col width="80"/>
            <col width="70"/>
            <thead>
              <tr valign="top">
                <td>Category</td>
                <td>Method</td>
                <td>Preprocessing</td>
                <td>Pearson</td>
                <td>Spearman</td>
                <td>Kendall</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="bottom">
                <td>Readability formulae (RF)</td>
                <td>Dale-Chall Index (DCI)</td>
                <td>Jst force period (FP)</td>
                <td><italic>.439</italic><sup>a</sup></td>
                <td>.381</td>
                <td><italic>.264</italic></td>
              </tr>
              <tr valign="top">
                <td>RF</td>
                <td>DCI</td>
                <td>Boi FP</td>
                <td>.437</td>
                <td><italic>.382</italic></td>
                <td><italic>.264</italic></td>
              </tr>
              <tr valign="top">
                <td>Components of readability formulae (CRF)</td>
                <td>Average number of difficult words per word</td>
                <td>Boi FP</td>
                <td><italic>.431</italic></td>
                <td><italic>.379</italic></td>
                <td><italic>.262</italic></td>
              </tr>
              <tr valign="top">
                <td>General medical vocabularies (GMVs)</td>
                <td>Average prefixes per sentence</td>
                <td>Jst FP</td>
                <td><italic>.263</italic></td>
                <td>.242</td>
                <td>.164</td>
              </tr>
              <tr valign="top">
                <td>GMVs</td>
                <td>International Statistical Classification of Diseases and Related Health Problems concepts per sentence</td>
                <td>Jst do not force period (DNFP)</td>
                <td>.014</td>
                <td><italic>.253</italic></td>
                <td><italic>.172</italic></td>
              </tr>
              <tr valign="top">
                <td>Consumer medical vocabulary (CMV)</td>
                <td>Consumer health vocabulary (CHV) mean score for all concepts</td>
                <td>Jst FP</td>
                <td><italic>.329</italic></td>
                <td>.313</td>
                <td>.216</td>
              </tr>
              <tr valign="top">
                <td>CMV</td>
                <td>CHV mean score for all concepts</td>
                <td>Boi FP</td>
                <td><italic>.329</italic></td>
                <td><italic>.325</italic></td>
                <td><italic>.224</italic></td>
              </tr>
              <tr valign="top">
                <td>EMV</td>
                <td>Number of MeSH (Medical Subject Headings) concepts</td>
                <td>Boi DNFP</td>
                <td><italic>.201</italic></td>
                <td>.166</td>
                <td>.113</td>
              </tr>
              <tr valign="top">
                <td>Expert medical vocabulary (EMV)</td>
                <td>Number of MeSH disease concepts</td>
                <td>Boi DNFP</td>
                <td>.179</td>
                <td><italic>.192</italic></td>
                <td><italic>.132</italic></td>
              </tr>
              <tr valign="top">
                <td>Natural language features (NLF)</td>
                <td>Average stop word per word</td>
                <td>Boi FP</td>
                <td><italic>.344</italic></td>
                <td>.312</td>
                <td>.213</td>
              </tr>
              <tr valign="top">
                <td>NLF</td>
                <td>Number of pronouns</td>
                <td>Boi FP</td>
                <td>.341</td>
                <td><italic>.364</italic></td>
                <td><italic>.252</italic></td>
              </tr>
              <tr valign="top">
                <td>HTML features (HF)</td>
                <td>Number of lists</td>
                <td>None</td>
                <td><italic>.114</italic></td>
                <td>.021</td>
                <td>.015</td>
              </tr>
              <tr valign="top">
                <td>HF</td>
                <td>Number of P tags</td>
                <td>None</td>
                <td>.110</td>
                <td><italic>.123</italic></td>
                <td><italic>.084</italic></td>
              </tr>
              <tr valign="top">
                <td>Word frequency features (WFF)</td>
                <td>Mean rank Medical Reddit</td>
                <td>Boi DNFP</td>
                <td><italic>.387</italic></td>
                <td>.312</td>
                <td>.214</td>
              </tr>
              <tr valign="top">
                <td>WFF</td>
                <td>50th percentile Medical Reddit</td>
                <td>Jst DNFP</td>
                <td>.351</td>
                <td><italic>.315</italic></td>
                <td><italic>.216</italic></td>
              </tr>
              <tr valign="top">
                <td>Machine learning regressors (MLR)</td>
                <td>eXtreme Gradient Boosting (XGB) Regressor</td>
                <td>Jst DNFP</td>
                <td><italic>.454</italic></td>
                <td><italic>.373</italic></td>
                <td>.258</td>
              </tr>
              <tr valign="top">
                <td>MLR</td>
                <td>Random Forest Regressor</td>
                <td>Boi DNFP</td>
                <td>.389</td>
                <td>.355</td>
                <td><italic>.264</italic></td>
              </tr>
              <tr valign="top">
                <td>Machine learning classifiers</td>
                <td>Multinomial Naïve Bayes</td>
                <td>Jst FP</td>
                <td><italic>.461</italic></td>
                <td><italic>.391</italic></td>
                <td><italic>.318</italic></td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>Italics used to highlight the best result of each group.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        
      </sec>
      <sec>
        <title>Evaluation of Preprocessing Pipelines and Heuristics</title>
        <p>Results from experiments with different preprocessing pipelines and heuristics are shown in <xref ref-type="fig" rid="figure2">Figures 2</xref> and <xref ref-type="fig" rid="figure3">3</xref>, respectively for CLEF 2015 and 2016. For each category of methods and combination of preprocessing and heuristics, we report their variability in terms of Spearman rank correlation with human assessments. Results for Pearson and Kendall correlations are reported in the <xref ref-type="app" rid="app3">Multimedia Appendix 3</xref>, but showed similar trends. We further report the summary results across all understandability assessment methods and sentence-ending heuristics for each of the preprocessing pipelines. Finally, we also report the interassessor correlation (last box) when multiple assessors provided judgments about the understandability of Web pages. This provides an indication of the range of variability and subjectiveness when assessing understandability, along with the highest correlation we measured between human assessors.</p>
        <p>We first examined the correlations between human assessments and RF. We found that the <italic>Naive</italic> preprocessing resulted in the lowest correlations, regardless of RF and heuristic (although <italic>DoNotForcePeriod</italic> performed better than <italic>ForcePeriod</italic>). Using Justext or Boilerplate resulted in higher correlations with human understandability assessments, and the <italic>ForcePeriod</italic> heuristic was shown to be better than <italic>DoNotForcePeriod</italic>. These results confirm our hypotheses in Palotti et al [<xref ref-type="bibr" rid="ref20">20</xref>]: we found these settings to produce lower variances in understandability estimations, and thus hypothesized that they were better suited to the task.</p>
        <p>Overall, among RF, the best results (highest correlations) were obtained by SMOG and DCI (see also <xref ref-type="table" rid="table3">Tables 3</xref> and <xref ref-type="table" rid="table4">4</xref>). Although no single setting outperformed the others in both collections, we found that the use of CLI and FRE with <italic>Justext</italic> provided the most stable results across the collections, with correlations as high as the best ones in both collections. These results confirmed our previous advice [<xref ref-type="bibr" rid="ref20">20</xref>], that is, in general, if using readability measures, CLI is to be preferred, along with an appropriate HTML extraction pipeline, regardless of the heuristic for sentence ending. We provide detailed plots to compare the results in this paper with those in Palotti et al [<xref ref-type="bibr" rid="ref20">20</xref>] in the <xref ref-type="app" rid="app4">Multimedia Appendix 4</xref>.</p>
        <p>When considering methods beyond those based on RF, we found that the highest correlations were achieved by the regressors (MLR) and classifiers (MLC), independently of the preprocessing method used. There is little difference in terms of effectiveness of methods in these categories, with the exception of regressors on CLEF 2015, which exhibited non-negligible variances: whereas the neural network regressor achieved a Pearson correlation of .44, the support vector regressor achieved only .30.</p>
        
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Correlations between understandability estimators and human assessments for Conference and Labs of the Evaluation Forum 2015. For example, the first boxplot on the top represents the distribution of Spearman correlations with human assessments across all features in the category readability formulae, obtained with the Naive Force Period preprocessing. Each box extends from the lower to the upper quartile values, with the red marker representing the median value for that category. Whiskers show the range of the data in each category and circles represent values considered outliers for the category (eg, Spearman correlation for Simple Measure of Gobbledygook (SMOG) index was .296 and for Automated Readability Index (ARI) was .194: these were outliers for that category). CMV: consumer medical vocabulary; CRF: components of readability formulae; DNFP: Do Not Force Period; EMV: expert medical vocabulary; FP: Force Period; GMV: general medical vocabulary; MLC: machine learning classifiers; MLR: machine learning regressors; NLF: natural language features; RF: readability formulae; WFF: word frequency features.</p>
          </caption>
          <graphic xlink:href="jmir_v21i1e10986_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Correlations between understandability estimators and human assessments for Conference and Labs of the Evaluation Forum (CLEF) 2016. CMV: consumer medical vocabulary; CRF: components of readability formulae; DNFP: Do Not Force Period; EMV: expert medical vocabulary; FP: Force Period; GMV: general medical vocabulary; MLC: machine learning classifiers; MLR: machine learning regressors; NLF: natural language features; RF: readability formulae; WFF: word frequency features.</p>
          </caption>
          <graphic xlink:href="jmir_v21i1e10986_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        
        <p>A common trend when comparing preprocessing pipelines is that the Naïve pipeline provided the weakest correlations with human assessments for CLEF 2016, regardless of estimation methods and heuristics. This result, however, was not confirmed for CLEF 2015, where the Naive preprocessing negatively influenced correlations for the RF category, but not for other categories, although it was generally associated with larger variances for the correlation coefficients.</p>
        
      </sec>
      <sec>
        <title>Evaluation of Understandability Retrieval</title>
        <sec>
          <title>Reranking Experiments</title>
          <p>Results for the considered retrieval methods are reported in <xref ref-type="fig" rid="figure4">Figures 4</xref>-<xref ref-type="fig" rid="figure8">8</xref>. We report only the results for CLEF 2016 for brevity; those for CLEF 2015 exhibited similar trends and are included in the <xref ref-type="app" rid="app5">Multimedia Appendix 5</xref>. When reranking results, we risk bringing to the top position a document that was never assessed. The RBP residuals (shown in gray in <xref ref-type="fig" rid="figure4">Figures 4</xref>-<xref ref-type="fig" rid="figure8">8</xref>) show the possible gains that unassessed documents can have on the evaluation, as it is assumed that all unassessed documents are relevant. Another way to quantify the effect that unassessed documents have on evaluation is looking at the average number of unassessed documents in the top 10 results: this is given by the metric <italic>Unj@10</italic>. Larger values of <italic>Unj@10</italic> imply that actual effectiveness might be noticeably larger. Here, we also show the values for the condensed measures.</p>
          <p>The effectiveness of the top 2 submissions to CLEF 2016 and the BM25 baseline are reported in <xref ref-type="fig" rid="figure4">Figure 4</xref>. In turn, we report the results of each subexperiment: <italic>simple reranking</italic> (<xref ref-type="fig" rid="figure5">Figures 5</xref> and <xref ref-type="fig" rid="figure6">6</xref>), <italic>fusion experiments</italic> (<xref ref-type="fig" rid="figure7">Figure 7</xref>), and <italic>learning to rank</italic> (<xref ref-type="fig" rid="figure8">Figure 8</xref>).</p>
          <fig id="figure4" position="float">
            <label>Figure 4</label>
            <caption>
              <p>Baseline results for the best 2 submissions to Conference and Labs of the Evaluation Forum (CLEF) 2016 (Georgetown University Information Retrieval [GUIR] and East China Normal University [ECNU]) and the Best Match 25 (BM25) baseline of Terrier. MM: multidimensional metric; RBP: rank biased precision.</p>
            </caption>
            <graphic xlink:href="jmir_v21i1e10986_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <fig id="figure5" position="float">
            <label>Figure 5</label>
            <caption>
              <p>Reranking of the runs based on the Dale-Chall readability formula. ECNU: East China Normal University; GUIR: Georgetown University Information Retrieval; MM: multidimensional metric; RBP: rank biased precision.</p>
            </caption>
            <graphic xlink:href="jmir_v21i1e10986_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <fig id="figure6" position="float">
            <label>Figure 6</label>
            <caption>
              <p>Reranking of the runs based on the eXtreme Gradient Boosting (XGB) regressor to estimate understandability. ECNU: East China Normal University; GUIR: Georgetown University Information Retrieval; MM: multidimensional metric; RBP: rank biased precision.</p>
            </caption>
            <graphic xlink:href="jmir_v21i1e10986_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <fig id="figure7" position="float">
            <label>Figure 7</label>
            <caption>
              <p>Reranking combining topical relevance (original run) and understandability (eXtreme Gradient Boosting [XGB]) through rank fusion. ECNU: East China Normal University; GUIR: Georgetown University Information Retrieval; MM: multidimensional metric; RBP: rank biased precision.</p>
            </caption>
            <graphic xlink:href="jmir_v21i1e10986_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <fig id="figure8" position="float">
            <label>Figure 8</label>
            <caption>
              <p>Results of the learning to rank (LTR) method on the Best Match 25 (BM25) baseline. The BM25 baseline (light blue) is shown for direct comparison. MM: multidimensional metric; RBP: rank biased precision.</p>
            </caption>
            <graphic xlink:href="jmir_v21i1e10986_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Simple Reranking</title>
          <p><xref ref-type="fig" rid="figure5">Figure 5</xref> reports the results of reranking methods applied to the runs shown in <xref ref-type="fig" rid="figure4">Figure 4</xref>. Reranking was applied based on the DCI score of each document calculated using the preprocessing combination of Boilerpipe and ForcePeriod (best according to Pearson correlation, from <xref ref-type="table" rid="table3">Tables 3</xref> and <xref ref-type="table" rid="table4">4</xref>). We found that the relevance of the reranked runs (as measured by RBP<sub>r</sub> and RBP<sup>*</sup><sub>r</sub>) significantly decreased, compared with the original runs, for example, reranking the top 15 search results using DCI made RBP<sub>r</sub> decrease from 25.28 to 21.58. However, as expected, these reranked results were significantly more understandable: for the previous example, RBP<sub>u</sub> passed from 42.08 to 47.09.</p>
          <p>In the experiments, we also studied the influence of the number of documents considered for reranking (cut-off). The top/middle/bottom plots of <xref ref-type="fig" rid="figure5">Figure 5</xref> refer to reranking only the top k=15/20/50 documents from the original runs. The results show that the more documents are considered for reranking, the greater the degradation in RBP<sub>r</sub> effectiveness. Considering understandability only in the evaluation shows mixed results. Similar trends were observed for evaluation measures that consider understandability (RBP and RBP<sub>u</sub>), albeit with some exceptions. For example, an increase in uRBP was observed when reranking ECNU using the top 50 results.</p>
          <p>Note that with the increase of the number of documents considered for reranking, there is an increase in the number of unassessed documents being considered by the evaluation measures. Nevertheless, we note that if unassessed documents are excluded from the evaluation, similar trends are observed, for example, compare findings with those for the condensed measures uRBP<sup>*</sup>, RBP<sup>*</sup><sub>r</sub>, RBP<sup>*</sup><sub>u</sub>, and MM<sup>*</sup><sub>RBP</sub>.</p>
          <p><xref ref-type="fig" rid="figure6">Figure 6</xref> refers to using a machine learning method, XGB regressor (<xref ref-type="boxed-text" rid="box9">Textbox 9</xref>), to estimate understandability. Similarly, when using DCI, as the cut-off increased, for example, from k=15 to k=50, the documents returned were more understandable but less relevant. For the same cut-off value, for example, k=15, the machine learning method used for estimating understandability consistently yielded more understandable results than DCI (higher RBP<sub>u</sub> and RBP<sup>*</sup><sub>u</sub>).</p>
          <p>Overall, statistically significant improvements over the baselines were observed for most configurations and measures.</p>
        </sec>
        <sec>
          <title>Rank Fusion</title>
          <p>Next, we report the results of automatically combining topical relevance and understandability through rank fusion in <xref ref-type="fig" rid="figure7">Figure 7</xref>. We used the XGB method for estimating understandability, as it was the one yielding highest effectiveness for the reranking method. Runs were thus produced by fusing the reranking with XGB and the original run. (Results for DCI are reported in the <xref ref-type="app" rid="app5">Multimedia Appendix 5</xref> and confirm the superiority of XGB.)</p>
          <p>As with reranking, for the rank fusion approaches we also found that, in general, higher cut-offs were associated with higher effectiveness in terms of understandability measures on one hand, but higher losses in terms of relevance-oriented measures on the other. Overall, results obtained with rank fusion were superior to those obtained with reranking only, although most differences were not statistically significant. Statistically significant improvements over the baselines were instead observed for most configurations and measures.</p>
        </sec>
        <sec>
          <title>Learning to Rank</title>
          <p>Finally, we analyze the results obtained by the learning to rank methods in <xref ref-type="fig" rid="figure8">Figure 8</xref>. Unlike with the previous methods, we did not impose a rank cut-off on learning to rank. Learning to rank was only applied to the BM25 baseline, as we had no access to the IR features for the runs submitted at CLEF (ie, GUIR and ECNU for CLEF 2016). BM25 baseline (<xref ref-type="fig" rid="figure4">Figure 4</xref>) is also shown in <xref ref-type="fig" rid="figure8">Figure 8</xref> for an easy and direct comparison.</p>
          <p>When considering RBP<sub>r</sub> and uRBP, learning to rank exhibited effectiveness that was significantly inferior to that of the GUIR and ECNU baseline runs, although higher than those for the BM25 baseline (for some configurations). The examination of the number of unassessed documents (and the RBP residuals, see <xref ref-type="app" rid="app5">Multimedia Appendix 5</xref>) revealed that this might have been because measures were affected by the large number of unassessed documents retrieved in the top 10 ranks. For example, the RBP<sub>r</sub> residual for learning to rank methods was about double that of the baselines or other approaches (see <xref ref-type="app" rid="app5">Multimedia Appendix 5</xref>). In fact, among the documents retrieved in the top 10 results by learning to rank, there were 20% (2/10) that were unassessed, compared with an average of 3% (0.3/10) for the other methods (excluding XGB with cutoff 50, which also exhibited high residuals).</p>
          <p>We thus should carefully account for unassessed documents through considering the residuals of RBP measures as well as the condensed measures. When this was done, we observed that learning to rank methods overall provided substantial gains over the original runs and other methods (when considering RBP<sup>*</sup><sub>r</sub>, RBP<sup>*</sup><sub>u</sub>, and MM<sup>*</sup><sub>RBP</sub>), or large potential gains over these methods (when considering the residuals). Next, we analyzed these results in more detail.</p>
          <p>No improvements over the baselines were found for LTR 1, and the high residuals for RBP<sub>r</sub> were not matched by other residuals or by considering only assessed documents (see <xref ref-type="app" rid="app5">Multimedia Appendix 5</xref>). LTR 1 was a simple method that used only IR features and was trained only on topical relevance. Specifically, we devised 24 IR features using the Terrier framework. The score of various retrieval models was extracted from a multifield index composed of title, body, and whole document. Although simple, this is a typical learning to rank setting.</p>
          <p>Compared with LTR 1, LTR 2 included the understandability features listed in <xref ref-type="boxed-text" rid="box1">Textboxes 1</xref>-<xref ref-type="boxed-text" rid="box10">10</xref>. This inclusion was as beneficial to the understandability measures as to the relevance measures, with RBP<sup>*</sup><sub>r</sub>, RBP<sup>*</sup><sub>u</sub>, and MM<sup>*</sup><sub>RBP</sub> all showing gains over the baselines. LTR 3 obtained similar MM<sup>*</sup><sub>RBP</sub> values, although with higher effectiveness for relevance measures (RBP<sup>*</sup><sub>r</sub>) than for understandability (RBP<sup>*</sup><sub>u</sub>).</p>
          <p>LTRs 4 and 5 were devised based on a set understandability threshold U=40. Although LTR 4 took into consideration only documents that were easy to read (understandability label≤U), LTR 5 considered all documents, but boosted the relevance score. LTR 4 reached the highest understandability score for the learning-to-rank approaches (RBP<sup>*</sup><sub>u</sub>=50.06), but it failed to retrieve a substantial number of relevant documents (RBP<sup>*</sup><sub>r</sub>=2.20). In turn, LTR 5 reached the highest understandability-relevance trade-off (MM<sup>*</sup><sub>RBP</sub>=29.20). Compared with the BM25 baseline (on which it was based), LTR 5 largely increased both relevance (RBP<sup>*</sup><sub>r</sub> from 26.01 to 32.60—a 25% increase, <italic>P</italic><sub>bl</sub>=.003) and understandability (RBP<sup>*</sup><sub>u</sub> from 43.89 to 45.87 — a 4% increase, <italic>P</italic><sub>bl</sub>&lt;.001). Note that LTR 5 was also significantly better than the best run submitted to CLEF 2016 for both RBP<sup>*</sup><sub>r</sub> (15% increase, <italic>P</italic><sub>g</sub>=.120) and MM<sup>*</sup><sub>RBP</sub> (13% increase, <italic>P</italic><sub>g</sub>=.001).</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>The empirical experiments suggested the following:</p>
        <list list-type="bullet">
          <list-item>
            <p>Machine learning methods based on regression are best suited to estimate the understandability of health Web pages</p>
          </list-item>
          <list-item>
            <p>Preprocessing does affect effectiveness (both for understandability prediction and document retrieval), although compared with other methods, ML-based methods for understandability estimation are less subject to variability caused by poor preprocessing</p>
          </list-item>
          <list-item>
            <p>Learning to rank methods can be specifically trained to promote more understandable search results, while still providing an effective trade-off with topical relevance.</p>
          </list-item>
        </list>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>In this study, we relied on data collected through the CLEF 2015 and CLEF 2016 evaluation efforts to evaluate the effectiveness of methods that estimate the understandability of the Web pages. These assessments were obtained by asking medical experts and practitioners to rate documents; although they were asked to estimate the understandability of the content as if they were the patients they treat, there might have been noise and imprecision in the collection mechanism because of the subjectivity of the task. <xref ref-type="fig" rid="figure2">Figure 2</xref> highlights this by showing that the agreement between assessors is relatively low. A better setting might have been to directly recruit health consumers: the task would still have been subjective but would have captured real ratings, rather than inferred or perceived ratings. Despite this, our previous work has shown that no substantial differences were found in the downstream evaluation of retrieval systems, when we acquired understandability assessments from health consumers for a subset of the CLEF 2015 collection [<xref ref-type="bibr" rid="ref46">46</xref>].</p>
        <p>Relevance assessments on the CLEF 2015 and 2016 collections are incomplete [<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref14">14</xref>], that is, not all top ranked Web pages retrieved by the investigated methods have an explicit relevance assessment. This is often the case in information retrieval, where the validity of experiments based on incomplete assessments has been thoroughly investigated [<xref ref-type="bibr" rid="ref88">88</xref>]. Nonetheless, we carefully controlled for the impact that unassessed documents had in our experiments by measuring their number and using measures such as RBP that account for residuals and condensed variants. The residuals analysis has been reported in the appendix.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>We have examined approaches to estimate the understandability of health Web pages, including the impact of HTML preprocessing techniques and how to integrate these within retrieval methods to provide more understandable search results for people seeking health information. We found that machine learning methods are better suited than traditionally employed readability measures for assessing the understandability of health-related Web pages and that learning to rank is the most effective strategy to integrate this into retrieval. We also found that HTML and text preprocessing do affect the effectiveness of both understandability estimations and of the retrieval process, although machine learning methods are less sensitive to this issue.</p>
        <p>This paper contributes to improving search engines tailored to consumer health search because it thoroughly investigates promises and pitfalls of understandability estimations and their integration into retrieval methods. The paper further highlights which methods and settings should be used to provide better search results to health information seekers. As shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>, these methods would clearly improve current health-focused search engines.</p>
        <p>The methods investigated here do not provide a fully personalized search, with respect to how much of the health content consumers with different health knowledge might be able to understand. Instead, we focus on making the results understandable by anyone, and thus promote in the search results content that has the highest level of understandability. However, people with a more than average medical knowledge might benefit more from more specialized content. We leave this personalization aspect, that is, the tailoring of the understandability level of the promoted content with respect to the user’s knowledge and abilities, to further work.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <app id="app1">
        <title>Multimedia Appendix 1</title>
        <p>The impact of feature sets on the Spearman correlation between the predicted understandability and the ground truth assessed by human assessors in Conference and Labs of the Evaluation Forum (CLEF) eHealth 2015.</p>
        <media xlink:href="jmir_v21i1e10986_app1.pdf" xlink:title="PDF File (Adobe PDF File), 627KB"/>
      </app>
      <app id="app2">
        <title>Multimedia Appendix 2</title>
        <p>Distribution of Understandability Scores for Conference and Labs of the Evaluation Forum (CLEF) 2016.</p>
        <media xlink:href="jmir_v21i1e10986_app2.pdf" xlink:title="PDF File (Adobe PDF File), 726KB"/>
      </app>
      <app id="app3">
        <title>Multimedia Appendix 3</title>
        <p>Correlations between understandability estimators and human assessments for Conference and Labs of the Evaluation Forum (CLEF) 2015 and CLEF 2016.</p>
        <media xlink:href="jmir_v21i1e10986_app3.pdf" xlink:title="PDF File (Adobe PDF File), 896KB"/>
      </app>
      <app id="app4">
        <title>Multimedia Appendix 4</title>
        <p>Correlation results of different readability formulae with human assessments from Conference and Labs of the Evaluation Forum (CLEF) eHealth 2015 and 2016.</p>
        <media xlink:href="jmir_v21i1e10986_app4.pdf" xlink:title="PDF File (Adobe PDF File), 974KB"/>
      </app>
      <app id="app5">
        <title>Multimedia Appendix 5</title>
        <p>Results obtained by integrating understandability estimations within retrieval methods on Conference and Labs of the Evaluation Forum (CLEF) 2015 and CLEF 2016.</p>
        <media xlink:href="jmir_v21i1e10986_app5.pdf" xlink:title="PDF File (Adobe PDF File), 1MB"/>
      </app>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">BM25</term>
          <def>
            <p>Best Match 25</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">CHV</term>
          <def>
            <p>consumer health vocabulary</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">CLEF</term>
          <def>
            <p>Conference and Labs of the Evaluation Forum</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">CLI</term>
          <def>
            <p>Coleman-Liau Index</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">CMV</term>
          <def>
            <p>consumer medical vocabulary</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">CRF</term>
          <def>
            <p>components of readability formulae</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">DCI</term>
          <def>
            <p>Dale-Chall Index</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">ECNU</term>
          <def>
            <p>East China Normal University</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">eHealth</term>
          <def>
            <p>electronic health</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">EMV</term>
          <def>
            <p>expert medical vocabulary</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">FRE</term>
          <def>
            <p>Flesch Reading Ease</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">GMV</term>
          <def>
            <p>general medical vocabulary</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">GUIR</term>
          <def>
            <p>Georgetown University Information Retrieval</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">HF</term>
          <def>
            <p>HTML features</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">HON</term>
          <def>
            <p>HealthOnNet</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">ICD</term>
          <def>
            <p>International Statistical Classification of Diseases and Related Health Problems</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb17">KISTI</term>
          <def>
            <p>Korean Institute of Science and Technology Information</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb18">LTR</term>
          <def>
            <p>learning to rank</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb19">MeSH</term>
          <def>
            <p>Medical Subject Headings</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb20">MLC</term>
          <def>
            <p>machine learning classifiers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb21">MLR</term>
          <def>
            <p>machine learning regressors</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb22">MM</term>
          <def>
            <p>multidimensional metric</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb23">NLF</term>
          <def>
            <p>natural language features</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb24">OV</term>
          <def>
            <p>out-of-vocabulary</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb25">POS</term>
          <def>
            <p>part of speech</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb26">RBP</term>
          <def>
            <p>rank biased precision</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb27">RF</term>
          <def>
            <p>readability formulae</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb28">SMOG</term>
          <def>
            <p>Simple Measure of Gobbledygook</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb29">WFF</term>
          <def>
            <p>word frequency features</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb30">XGB</term>
          <def>
            <p>eXtreme Gradient Boosting</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors acknowledge the Technische Universität Wien (TU Wien) University Library for financial support through its Open Access Funding Programme. GZ is the recipient of an Australian Research Council Discovery Early Career Researcher Award (DECRA) Research Fellowship (DE180101579) and a Google Faculty Award. This project has received funding from the European Union’s Horizon 2020 research and innovation programme under grant agreement No 644753 (KConnect).</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Lease</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Gwizdka</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>Multidimensional relevance modeling via psychometrics and crowdsourcing</article-title>
        <year>2014</year>  
        <conf-name>Proceedings of the 37th international ACM SIGIR conference on Research &amp; development in information retrieval</conf-name>
        <conf-date>July 06 - 11, 2014</conf-date>
        <conf-loc>Gold Coast, Queensland, Australia</conf-loc>
        <publisher-name>ACM</publisher-name>
        <fpage>435</fpage>  
        <lpage>444</lpage>  
        <pub-id pub-id-type="doi">10.1145/2600428.2609577</pub-id></nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Fergus</surname>
            <given-names>TA</given-names>
          </name>
          <name name-style="western">
            <surname>Spada</surname>
            <given-names>MM</given-names>
          </name>
        </person-group>
        <article-title>Cyberchondria: examining relations with problematic internet use and metacognitive beliefs</article-title>
        <source>Clin Psychol Psychother</source>  
        <year>2017</year>  
        <month>11</month>  
        <volume>24</volume>  
        <issue>6</issue>  
        <fpage>1322</fpage>  
        <lpage>1330</lpage>  
        <pub-id pub-id-type="doi">10.1002/cpp.2102</pub-id>
        <pub-id pub-id-type="medline">28621035</pub-id></nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Diviani</surname>
            <given-names>N</given-names>
          </name>
          <name name-style="western">
            <surname>van den Putte</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Giani</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>van Weert</surname>
            <given-names>JC</given-names>
          </name>
        </person-group>
        <article-title>Low health literacy and evaluation of online health information: a systematic review of the literature</article-title>
        <source>J Med Internet Res</source>  
        <year>2015</year>  
        <month>05</month>  
        <day>07</day>  
        <volume>17</volume>  
        <issue>5</issue>  
        <fpage>e112</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jmir.org/2015/5/e112/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/jmir.4018</pub-id>
        <pub-id pub-id-type="medline">25953147</pub-id>
        <pub-id pub-id-type="pii">v17i5e112</pub-id>
        <pub-id pub-id-type="pmcid">PMC4468598</pub-id></nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>White</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Horvitz</surname>
            <given-names>E</given-names>
          </name>
        </person-group>
        <article-title>Cyberchondria: Studies of the escalation of medical concerns in Web search</article-title>
        <source>ACM Trans Inf Syst</source>  
        <year>2009</year>  
        <month>11</month>  
        <day>01</day>  
        <volume>27</volume>  
        <issue>4</issue>  
        <fpage>1</fpage>  
        <lpage>37</lpage>  
        <pub-id pub-id-type="doi">10.1145/1629096.1629101</pub-id></nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>White</surname>
            <given-names>R</given-names>
          </name>
        </person-group>
        <article-title>Beliefs and Biases in Web Search</article-title>
        <year>2013</year>  
        <conf-name>Proceedings of the 36th International ACM SIGIR Conference on Research and Development in Information Retrieval</conf-name>
        <conf-date>July 28 - August 01, 2013</conf-date>
        <conf-loc>Dublin, Ireland</conf-loc>
        <publisher-name>ACM</publisher-name>
        <fpage>3</fpage>  
        <lpage>12</lpage>  
        <pub-id pub-id-type="doi">10.1145/2484028.2484053</pub-id></nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Graber</surname>
            <given-names>MA</given-names>
          </name>
          <name name-style="western">
            <surname>Roller</surname>
            <given-names>CM</given-names>
          </name>
          <name name-style="western">
            <surname>Kaeble</surname>
            <given-names>B</given-names>
          </name>
        </person-group>
        <article-title>Readability levels of patient education material on the World Wide Web</article-title>
        <source>J Fam Pract</source>  
        <year>1999</year>  
        <month>01</month>  
        <volume>48</volume>  
        <issue>1</issue>  
        <fpage>58</fpage>  
        <lpage>61</lpage>  
        <pub-id pub-id-type="medline">9934385</pub-id></nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Fitzsimmons</surname>
            <given-names>PR</given-names>
          </name>
          <name name-style="western">
            <surname>Michael</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Hulley</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Scott</surname>
            <given-names>G</given-names>
          </name>
        </person-group>
        <article-title>A readability assessment of online Parkinson's disease information</article-title>
        <source>J R Coll Physicians Edinb</source>  
        <year>2010</year>  
        <month>12</month>  
        <volume>40</volume>  
        <issue>4</issue>  
        <fpage>292</fpage>  
        <lpage>296</lpage>  
        <pub-id pub-id-type="doi">10.4997/JRCPE.2010.401</pub-id>
        <pub-id pub-id-type="medline">21132132</pub-id></nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Wiener</surname>
            <given-names>RC</given-names>
          </name>
          <name name-style="western">
            <surname>Wiener-Pla</surname>
            <given-names>R</given-names>
          </name>
        </person-group>
        <article-title>Literacy, pregnancy and potential oral health changes: the internet and readability levels</article-title>
        <source>Matern Child Health J</source>  
        <year>2014</year>  
        <month>04</month>  
        <volume>18</volume>  
        <issue>3</issue>  
        <fpage>657</fpage>  
        <lpage>662</lpage>  
        <pub-id pub-id-type="doi">10.1007/s10995-013-1290-1</pub-id>
        <pub-id pub-id-type="medline">23784613</pub-id>
        <pub-id pub-id-type="pmcid">PMC4919661</pub-id></nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Patel</surname>
            <given-names>CR</given-names>
          </name>
          <name name-style="western">
            <surname>Cherla</surname>
            <given-names>DV</given-names>
          </name>
          <name name-style="western">
            <surname>Sanghvi</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Baredes</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Eloy</surname>
            <given-names>JA</given-names>
          </name>
        </person-group>
        <article-title>Readability assessment of online thyroid surgery patient education materials</article-title>
        <source>Head Neck</source>  
        <year>2013</year>  
        <month>10</month>  
        <volume>35</volume>  
        <issue>10</issue>  
        <fpage>1421</fpage>  
        <lpage>1425</lpage>  
        <pub-id pub-id-type="doi">10.1002/hed.23157</pub-id>
        <pub-id pub-id-type="medline">22972634</pub-id></nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Meillier</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Patel</surname>
            <given-names>S</given-names>
          </name>
        </person-group>
        <article-title>Readability of healthcare literature for gastroparesis and evaluation of medical terminology in reading difficulty</article-title>
        <source>Gastroenterology Res</source>  
        <year>2017</year>  
        <month>02</month>  
        <volume>10</volume>  
        <issue>1</issue>  
        <fpage>1</fpage>  
        <lpage>5</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/28270870"/>
        </comment>  
        <pub-id pub-id-type="doi">10.14740/gr746w</pub-id>
        <pub-id pub-id-type="medline">28270870</pub-id>
        <pub-id pub-id-type="pmcid">PMC5330686</pub-id></nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Ellimoottil</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Polcari</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Kadlec</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Gupta</surname>
            <given-names>G</given-names>
          </name>
        </person-group>
        <article-title>Readability of websites containing information about prostate cancer treatment options</article-title>
        <source>J Urol</source>  
        <year>2012</year>  
        <month>12</month>  
        <volume>188</volume>  
        <issue>6</issue>  
        <fpage>2171</fpage>  
        <lpage>2176</lpage>  
        <pub-id pub-id-type="doi">10.1016/j.juro.2012.07.105</pub-id>
        <pub-id pub-id-type="medline">23083852</pub-id>
        <pub-id pub-id-type="pii">S0022-5347(12)04411-4</pub-id></nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="web">
        <source>University of Waterloo</source>  
        <year>2016</year>  
        <access-date>2018-03-29</access-date>
        <comment>Cura Te Ipsum: Answering Symptom Queries with Question Intent 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://plg2.cs.uwaterloo.ca/~avtyurin/WebQA2016/">http://plg2.cs.uwaterloo.ca/~avtyurin/WebQA2016/</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6yHTeM33k"/></comment> </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Boyer</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Dolamic</surname>
            <given-names>L</given-names>
          </name>
        </person-group>
        <article-title>Automated detection of HONcode website conformity compared to manual detection: an evaluation</article-title>
        <source>J Med Internet Res</source>  
        <year>2015</year>  
        <month>06</month>  
        <day>02</day>  
        <volume>17</volume>  
        <issue>6</issue>  
        <fpage>e135</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jmir.org/2015/6/e135/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/jmir.3831</pub-id>
        <pub-id pub-id-type="medline">26036669</pub-id>
        <pub-id pub-id-type="pii">v17i6e135</pub-id>
        <pub-id pub-id-type="pmcid">PMC4526900</pub-id></nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zuccon</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Palotti</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Goeuriot</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Kelly</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Lupu</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Pecina</surname>
            <given-names>P</given-names>
          </name>
        </person-group>
        <article-title>The IR Task at the CLEF eHealth evaluation lab 2016: User-centred Health Information Retrieval</article-title>
        <year>2016</year>  
        <month>09</month>  
        <conf-name>Conference and Labs of the Evaluation Forum</conf-name>
        <conf-date>September 5-8, 2016</conf-date>
        <conf-loc>Évora, Portugal</conf-loc>
        <publisher-name>CEUR-WS</publisher-name>
        <fpage>15</fpage>  
        <lpage>27</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://ceur-ws.org/Vol-1609/16090015.pdf"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Doak</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Doak</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Root</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <source>Teaching Patients with Low Literacy Skills</source>  
        <year>1995</year>  
        <publisher-loc>Philadelphia, Pennsylvania, United States</publisher-loc>
        <publisher-name>Lippincott Williams and Wilkins</publisher-name></nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Wallace</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Lennon</surname>
            <given-names>E</given-names>
          </name>
        </person-group>
        <article-title>American Academy of Family Physicians patient education materials: can patients read them?</article-title>
        <source>Fam Med</source>  
        <year>2004</year>  
        <month>09</month>  
        <volume>36</volume>  
        <issue>8</issue>  
        <fpage>571</fpage>  
        <lpage>574</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.stfm.org/fmhub/fm2004/September/Lorraine571.pdf"/>
        </comment>  
        <pub-id pub-id-type="medline">15343418</pub-id></nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Davis</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Wolf</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Health literacy: implications for family medicine</article-title>
        <source>Fam Med</source>  
        <year>2004</year>  
        <month>09</month>  
        <volume>36</volume>  
        <issue>8</issue>  
        <fpage>595</fpage>  
        <lpage>598</lpage>  
        <pub-id pub-id-type="medline">15343422</pub-id></nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Stossel</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Segar</surname>
            <given-names>N</given-names>
          </name>
          <name name-style="western">
            <surname>Gliatto</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Fallar</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Karani</surname>
            <given-names>R</given-names>
          </name>
        </person-group>
        <article-title>Readability of patient education materials available at the point of care</article-title>
        <source>J Gen Intern Med</source>  
        <year>2012</year>  
        <month>09</month>  
        <volume>27</volume>  
        <issue>9</issue>  
        <fpage>1165</fpage>  
        <lpage>1170</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/22528620"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1007/s11606-012-2046-0</pub-id>
        <pub-id pub-id-type="medline">22528620</pub-id>
        <pub-id pub-id-type="pmcid">PMC3514986</pub-id></nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
        <source>NIH publication</source>  
        <access-date>2017-09-01</access-date>
        <comment>Clear &amp; Simple: Developing Effective Print Materials for Low-literate Readers 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://www.nih.gov/institutes-nih/nih-office-director/office-communications-public-liaison/clear-communication/">https://www.nih.gov/institutes-nih/nih-office-director/office-communications-public-liaison/clear-communication/</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6yHTsSTK7"/></comment> </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Palotti</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Zuccon</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Hanbury</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>The Influence of Pre-processing on the Estimation of Readability of Web Documents</article-title>
        <year>2015</year>  
        <conf-name>Proceedings of the 24th ACM International on Conference on Information and Knowledge Management</conf-name>
        <conf-date>October 18 - 23, 2015</conf-date>
        <conf-loc>Melbourne, Australia</conf-loc>
        <publisher-loc>USA</publisher-loc>
        <publisher-name>ACM</publisher-name>
        <fpage>1763</fpage>  
        <lpage>1766</lpage>  
        <pub-id pub-id-type="doi">10.1145/2806416.2806613</pub-id></nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Palotti</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Goeuriot</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Zuccon</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Hanbury</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>Ranking Health Web Pages with Relevance and Understandability</article-title>
        <year>2016</year>  
        <conf-name>Proceedings of the 39th International ACM SIGIR conference on Research and Development in Information Retrieval</conf-name>
        <conf-date>July 17 - 21, 2016</conf-date>
        <conf-loc>Pisa, Italy</conf-loc>
        <fpage>965</fpage>  
        <lpage>968</lpage>  
        <pub-id pub-id-type="doi">10.1145/2911451.2914741</pub-id></nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Shoemaker</surname>
            <given-names>SJ</given-names>
          </name>
          <name name-style="western">
            <surname>Wolf</surname>
            <given-names>MS</given-names>
          </name>
          <name name-style="western">
            <surname>Brach</surname>
            <given-names>C</given-names>
          </name>
        </person-group>
        <article-title>Development of the Patient Education Materials Assessment Tool (PEMAT): a new measure of understandability and actionability for print and audiovisual patient information</article-title>
        <source>Patient Educ Couns</source>  
        <year>2014</year>  
        <month>09</month>  
        <volume>96</volume>  
        <issue>3</issue>  
        <fpage>395</fpage>  
        <lpage>403</lpage>  
        <pub-id pub-id-type="doi">10.1016/j.pec.2014.05.027</pub-id>
        <pub-id pub-id-type="medline">24973195</pub-id>
        <pub-id pub-id-type="pii">S0738-3991(14)00233-X</pub-id></nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Becker</surname>
            <given-names>SA</given-names>
          </name>
        </person-group>
        <article-title>A study of web usability for older adults seeking online health resources</article-title>
        <source>ACM Trans Comput-Hum Interact</source>  
        <year>2004</year>  
        <month>12</month>  
        <day>01</day>  
        <volume>11</volume>  
        <issue>4</issue>  
        <fpage>387</fpage>  
        <lpage>406</lpage>  
        <pub-id pub-id-type="doi">10.1145/1035575.1035578</pub-id></nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zheng</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Yu</surname>
            <given-names>H</given-names>
          </name>
        </person-group>
        <article-title>Readability formulas and user perceptions of electronic health records difficulty: a corpus study</article-title>
        <source>J Med Internet Res</source>  
        <year>2017</year>  
        <month>03</month>  
        <day>02</day>  
        <volume>19</volume>  
        <issue>3</issue>  
        <fpage>e59</fpage>  
        <pub-id pub-id-type="doi">10.2196/jmir.6962</pub-id>
        <pub-id pub-id-type="medline">28254738</pub-id></nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Coleman</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Liau</surname>
            <given-names>TL</given-names>
          </name>
        </person-group>
        <article-title>A computer readability formula designed for machine scoring</article-title>
        <source>J Appl Psychol</source>  
        <year>1975</year>  
        <volume>60</volume>  
        <issue>2</issue>  
        <fpage>283</fpage>  
        <lpage>284</lpage>  
        <pub-id pub-id-type="doi">10.1037/h0076540</pub-id>
        <pub-id pub-id-type="medline">28425725</pub-id></nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Dale</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Chall</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>A formula for predicting readability: instructions</article-title>
        <source>Educational Research Bulletin</source>  
        <year>1948</year>  
        <month>02</month>  
        <day>18</day>  
        <volume>27</volume>  
        <issue>2</issue>  
        <fpage>37</fpage>  
        <lpage>54</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jstor.org/stable/1473669"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Kincaid</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Fishburne</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Rogers</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Chissom</surname>
            <given-names>B</given-names>
          </name>
        </person-group>
        <source>National Technical Information Service</source>  
        <year>1975</year>  
        <month>02</month>  
        <access-date>2018-12-04</access-date>
        <publisher-loc>Florida, USA</publisher-loc>
        <publisher-name>Institute for Simulation and Training, University of Central Florida</publisher-name>
        <comment>Derivation of New Readability Formulas for Navy Enlisted Personnel 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.dtic.mil/dtic/tr/fulltext/u2/a006655.pdf">http://www.dtic.mil/dtic/tr/fulltext/u2/a006655.pdf</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="74PMHgBnJ"/></comment> </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Dubay</surname>
            <given-names>W</given-names>
          </name>
        </person-group>
        <source>Impact Information</source>  
        <year>2004</year>  
        <month>08</month>  
        <day>25</day>  
        <access-date>2018-11-28</access-date>
        <publisher-loc>Costa Mesa, CA</publisher-loc>
        <publisher-name>Impact Information</publisher-name>
        <comment>The Principles of Readability 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.impact-information.com/impactinfo/readability02.pdf">http://www.impact-information.com/impactinfo/readability02.pdf</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="74GkzsGi0"/></comment> </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>X</given-names>
          </name>
          <name name-style="western">
            <surname>Croft</surname>
            <given-names>W</given-names>
          </name>
          <name name-style="western">
            <surname>Oh</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Hart</surname>
            <given-names>D</given-names>
          </name>
        </person-group>
        <article-title>Automatic Recognition of Reading Levels from User Queries</article-title>
        <year>2004</year>  
        <conf-name>Proceedings of the 27th annual international ACM SIGIR conference on Research and development in information retrieval</conf-name>
        <conf-date>July 25 - 29, 2004</conf-date>
        <conf-loc>Sheffield, United Kingdom</conf-loc>
        <fpage>548</fpage>  
        <lpage>549</lpage>  
        <pub-id pub-id-type="doi">10.1145/1008992.1009114</pub-id></nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Collins-Thompson</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Callan</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>Predicting reading difficulty with statistical language models</article-title>
        <source>J Am Soc Inf Sci</source>  
        <year>2005</year>  
        <month>11</month>  
        <volume>56</volume>  
        <issue>13</issue>  
        <fpage>1448</fpage>  
        <lpage>1462</lpage>  
        <pub-id pub-id-type="doi">10.1002/asi.20243</pub-id></nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Heilman</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Collins-Thompson</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Callan</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Eskenazi</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Combining lexical and grammatical features to improve readability measures for first and second language texts</article-title>
        <year>2007</year>  
        <conf-name>The Conference of the North American Chapter of the Association for Computational Linguistics</conf-name>
        <conf-date>April 22-27, 2007</conf-date>
        <conf-loc>Rochester, New York</conf-loc>
        <fpage>460</fpage>  
        <lpage>467</lpage> </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Pitler</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Nenkova</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>Revisiting Readability: A Unified Framework for Predicting Text Quality</article-title>
        <year>2008</year>  
        <conf-name>Proceedings of the Conference on Empirical Methods in Natural Language Processing</conf-name>
        <conf-date>October 25 - 27, 2008</conf-date>
        <conf-loc>Honolulu, Hawaii</conf-loc>
        <fpage>186</fpage>  
        <lpage>195</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://dl.acm.org/citation.cfm?id=1613715.1613742"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zeng</surname>
            <given-names>Q</given-names>
          </name>
          <name name-style="western">
            <surname>Kim</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Crowell</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Tse</surname>
            <given-names>T</given-names>
          </name>
        </person-group>
        <article-title>A Text Corpora-based Estimation of the Familiarity of Health Terminology</article-title>
        <year>2005</year>  
        <conf-name>Proceedings of the 6th International conference on Biological and Medical Data Analysis</conf-name>
        <conf-date>November 10 - 11, 2005</conf-date>
        <conf-loc>Aveiro, Portugal</conf-loc>
        <fpage>184</fpage>  
        <lpage>192</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/citation.cfm?id=2146189"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1007/11573067_19</pub-id></nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zeng</surname>
            <given-names>QT</given-names>
          </name>
          <name name-style="western">
            <surname>Tse</surname>
            <given-names>T</given-names>
          </name>
        </person-group>
        <article-title>Exploring and developing consumer health vocabularies</article-title>
        <source>J Am Med Inform Assoc</source>  
        <year>2006</year>  
        <volume>13</volume>  
        <issue>1</issue>  
        <fpage>24</fpage>  
        <lpage>29</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/16221948"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1197/jamia.M1761</pub-id>
        <pub-id pub-id-type="medline">16221948</pub-id>
        <pub-id pub-id-type="pii">M1761</pub-id>
        <pub-id pub-id-type="pmcid">PMC1380193</pub-id></nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>McCloskey</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Caramazza</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Basili</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>Cognitive mechanisms in number processing and calculation: evidence from dyscalculia</article-title>
        <source>Brain Cogn</source>  
        <year>1985</year>  
        <month>04</month>  
        <volume>4</volume>  
        <issue>2</issue>  
        <fpage>171</fpage>  
        <lpage>196</lpage>  
        <pub-id pub-id-type="medline">2409994</pub-id>
        <pub-id pub-id-type="pii">0278-2626(85)90069-7</pub-id></nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Leroy</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Helmreich</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Cowie</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Miller</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Zheng</surname>
            <given-names>W</given-names>
          </name>
        </person-group>
        <article-title>Evaluating online health information: beyond readability formulas</article-title>
        <source>AMIA Annu Symp Proc</source>  
        <year>2008</year>  
        <month>11</month>  
        <day>06</day>  
        <volume>2008</volume>  
        <fpage>394</fpage>  
        <lpage>8</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/18998902"/>
        </comment>  
        <pub-id pub-id-type="medline">18998902</pub-id>
        <pub-id pub-id-type="pmcid">PMC2656067</pub-id></nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Palotti</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Hanbury</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Müller</surname>
            <given-names>H</given-names>
          </name>
        </person-group>
        <article-title>Exploiting Health Related Features to Infer User Expertise in the Medical Domain</article-title>
        <year>2014</year>  
        <conf-name>Proceedings of WSCD Workshop on Web Search and Data Mining</conf-name>
        <conf-date>February 24-28, 2014</conf-date>
        <conf-loc>New York, USA</conf-loc>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://publications.hevs.ch/index.php/publications/show/1632"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Yan</surname>
            <given-names>X</given-names>
          </name>
          <name name-style="western">
            <surname>Lau</surname>
            <given-names>RY</given-names>
          </name>
          <name name-style="western">
            <surname>Song</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Li</surname>
            <given-names>X</given-names>
          </name>
          <name name-style="western">
            <surname>Ma</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>Toward a semantic granularity model for domain-specific information retrieval</article-title>
        <source>ACM Trans Inf Syst</source>  
        <year>2011</year>  
        <month>07</month>  
        <day>01</day>  
        <volume>29</volume>  
        <issue>3</issue>  
        <fpage>1</fpage>  
        <lpage>46</lpage>  
        <pub-id pub-id-type="doi">10.1145/1993036.1993039</pub-id></nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Kim</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Goryachev</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Rosemblat</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Browne</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Keselman</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Zeng-Treitler</surname>
            <given-names>Q</given-names>
          </name>
        </person-group>
        <article-title>Beyond surface characteristics: a new health text-specific readability measurement</article-title>
        <source>AMIA Annu Symp Proc</source>  
        <year>2007</year>  
        <month>10</month>  
        <day>11</day>  
        <volume>2007</volume>  
        <fpage>418</fpage>  
        <lpage>22</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/18693870"/>
        </comment>  
        <pub-id pub-id-type="medline">18693870</pub-id>
        <pub-id pub-id-type="pmcid">PMC2655856</pub-id></nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>van Doorn</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Odijk</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Roijers</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>de Rijke</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Balancing Relevance Criteria through Multi-objective Optimization</article-title>
        <year>2016</year>  
        <conf-name>Proceedings of the 39th International ACM SIGIR conference on Research and Development in Information Retrieval</conf-name>
        <conf-date>July 17 - 21, 2016</conf-date>
        <conf-loc>Pisa, Italy</conf-loc>
        <fpage>769</fpage>  
        <lpage>772</lpage>  
        <pub-id pub-id-type="doi">10.1145/2911451.2914708</pub-id></nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zuccon</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Koopman</surname>
            <given-names>B</given-names>
          </name>
        </person-group>
        <article-title>Integrating Understandability in the Evaluation of Consumer Health Search Engines</article-title>
        <year>2014</year>  
        <conf-name>Medical Information Retrieval (MedIR) Workshop</conf-name>
        <conf-date>July 11 2014</conf-date>
        <conf-loc>Gold Coast, Australia</conf-loc>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://eprints.qut.edu.au/72854/"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zuccon</surname>
            <given-names>G</given-names>
          </name>
        </person-group>
        <article-title>Understandability Biased Evaluation for Information Retrieval</article-title>
        <year>2016</year>  
        <month>03</month>  
        <conf-name>European Conference on Information Retrieval</conf-name>
        <conf-date>March 20-23, 2016</conf-date>
        <conf-loc>Padua, Italy</conf-loc>
        <fpage>280</fpage>  
        <lpage>292</lpage>  
        <pub-id pub-id-type="doi">10.1007/978-3-319-30671-1_21</pub-id></nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="web">
        <source>CLEF eHealth</source>  
        <access-date>2018-04-23</access-date>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://sites.google.com/site/clefehealth/">https://sites.google.com/site/clefehealth/</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6ytaboB6c"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Palotti</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Zuccon</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Goeuriot</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Kelly</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Hanbury</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Jones</surname>
            <given-names>G</given-names>
          </name>
        </person-group>
        <article-title>ShARe/CLEF eHealth Evaluation Lab 2015, Task 2: User-centred Health Information Retrieval</article-title>
        <year>2015</year>  
        <conf-name>Conference and Labs of the Evaluation Forum</conf-name>
        <conf-date>September 8-11, 2015</conf-date>
        <conf-loc>Toulouse, France</conf-loc>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://ceur-ws.org/Vol-1391/inv-pap9-CR.pdf"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="web">
        <source>Lemur</source>  
        <access-date>2018-04-17</access-date>
        <comment>The ClueWeb12 Dataset 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://lemurproject.org/clueweb12/">http://lemurproject.org/clueweb12/</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6ykSxv4Hp"/></comment> </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Palotti</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Zuccon</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Bernhardt</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Hanbury</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Goeuriot</surname>
            <given-names>L</given-names>
          </name>
        </person-group>
        <article-title>Assessors agreement: A case study across assessor type, payment levels, query variations and relevance dimensions</article-title>
        <year>2016</year>  
        <conf-name>Conference and Labs of the Evaluation Forum</conf-name>
        <conf-date>September 5-8, 2016</conf-date>
        <conf-loc>Évora, Portugal</conf-loc>
        <publisher-name>Springer International Publishing</publisher-name>
        <fpage>40</fpage>  
        <lpage>53</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://publik.tuwien.ac.at/files/publik_257829.pdf"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1007/978-3-319-44564-9_4</pub-id></nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Koopman</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Zuccon</surname>
            <given-names>G</given-names>
          </name>
        </person-group>
        <article-title>Relevation!: An open source system for information retrieval relevance assessment</article-title>
        <year>2014</year>  
        <conf-name>Proceedings of the 37th international ACM SIGIR conference on Research &amp; development in information retrieval</conf-name>
        <conf-date>July 06 - 11, 2014</conf-date>
        <conf-loc>Gold Coast, Queensland, Australia</conf-loc>
        <fpage>1243</fpage>  
        <lpage>1244</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://eprints.qut.edu.au/72102/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1145/2600428.2611175</pub-id></nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Smith</surname>
            <given-names>EA</given-names>
          </name>
          <name name-style="western">
            <surname>Senter</surname>
            <given-names>RJ</given-names>
          </name>
        </person-group>
        <article-title>Automated readability index</article-title>
        <source>AMRL TR</source>  
        <year>1967</year>  
        <month>05</month>  
        <fpage>1</fpage>  
        <lpage>14</lpage>  
        <pub-id pub-id-type="medline">5302480</pub-id></nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Gunning</surname>
            <given-names>R</given-names>
          </name>
        </person-group>
        <source>The Technique of Clear Writing</source>  
        <year>1968</year>  
        <publisher-loc>New York City</publisher-loc>
        <publisher-name>McGraw Hill Higher Education</publisher-name></nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Björnsson</surname>
            <given-names>CH</given-names>
          </name>
        </person-group>
        <article-title>Readability of Newspapers in 11 Languages</article-title>
        <source>Read Res Q</source>  
        <year>1983</year>  
        <volume>18</volume>  
        <issue>4</issue>  
        <fpage>480</fpage>  
        <lpage>497</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jstor.org/stable/747382"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2307/747382</pub-id></nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>McLaughlin</surname>
            <given-names>G</given-names>
          </name>
        </person-group>
        <article-title>SMOG grading - a new readability formula</article-title>
        <source>Journal of Reading</source>  
        <year>1969</year>  
        <volume>12</volume>  
        <issue>8</issue>  
        <fpage>639</fpage>  
        <lpage>646</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jstor.org/stable/40011226"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Collins-Thompson</surname>
            <given-names>K</given-names>
          </name>
        </person-group>
        <article-title>Computational assessment of text readability: a survey of current and future research</article-title>
        <source>ITL</source>  
        <year>2015</year>  
        <month>1</month>  
        <day>23</day>  
        <volume>165</volume>  
        <issue>2</issue>  
        <fpage>97</fpage>  
        <lpage>135</lpage>  
        <pub-id pub-id-type="doi">10.1075/itl.165.2.01col</pub-id></nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="web">
        <source>Pyphen</source>  
        <year>2017</year>  
        <access-date>2018-03-29</access-date>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://pyphen.org">http://pyphen.org</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6yHSW2aHz"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="web">
        <source>OpenMedSpel (en-us)</source>  
        <year>2017</year>  
        <access-date>2018-10-21</access-date>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://extensions.openoffice.org/en/project/openmedspel-en-us">http://extensions.openoffice.org/en/project/openmedspel-en-us</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6yHd3KTZc"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zhou</surname>
            <given-names>W</given-names>
          </name>
          <name name-style="western">
            <surname>Torvik</surname>
            <given-names>VI</given-names>
          </name>
          <name name-style="western">
            <surname>Smalheiser</surname>
            <given-names>NR</given-names>
          </name>
        </person-group>
        <article-title>ADAM: another database of abbreviations in MEDLINE</article-title>
        <source>Bioinformatics</source>  
        <year>2006</year>  
        <month>11</month>  
        <day>15</day>  
        <volume>22</volume>  
        <issue>22</issue>  
        <fpage>2813</fpage>  
        <lpage>2818</lpage>  
        <pub-id pub-id-type="doi">10.1093/bioinformatics/btl480</pub-id>
        <pub-id pub-id-type="medline">16982707</pub-id>
        <pub-id pub-id-type="pii">btl480</pub-id></nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Aronson</surname>
            <given-names>AR</given-names>
          </name>
          <name name-style="western">
            <surname>Lang</surname>
            <given-names>F</given-names>
          </name>
        </person-group>
        <article-title>An overview of MetaMap: historical perspective and recent advances</article-title>
        <source>J Am Med Inform Assoc</source>  
        <year>2010</year>  
        <volume>17</volume>  
        <issue>3</issue>  
        <fpage>229</fpage>  
        <lpage>36</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/20442139"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1136/jamia.2009.002733</pub-id>
        <pub-id pub-id-type="medline">20442139</pub-id>
        <pub-id pub-id-type="pii">17/3/229</pub-id>
        <pub-id pub-id-type="pmcid">PMC2995713</pub-id></nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Pang</surname>
            <given-names>C</given-names>
          </name>
        </person-group>
        <source>Minerva Access</source>  
        <year>2016</year>  
        <access-date>2018-11-28</access-date>
        <comment>Understanding Exploratory Search in Seeking Health Information 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://minerva-access.unimelb.edu.au/handle/11343/115239">https://minerva-access.unimelb.edu.au/handle/11343/115239</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="74GoJu166"/></comment> </nlm-citation>
      </ref>
      <ref id="ref58">
        <label>58</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Agrafiotes</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Arampatzis</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>Augmenting Medical Queries with UMLS Concepts via MetaMap</article-title>
        <year>2016</year>  
        <conf-name>Proceedings of The Twenty-Fifth Text REtrieval Conference</conf-name>
        <conf-date>November 15-18, 2016</conf-date>
        <conf-loc>Gaithersburg, Maryland, USA</conf-loc>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://trec.nist.gov/pubs/trec25/papers/DUTH-CL.pdf"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref59">
        <label>59</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Palotti</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Hanbury</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Müller</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Kahn</surname>
            <given-names>CE</given-names>
          </name>
        </person-group>
        <article-title>How users search and what they search for in the medical domain</article-title>
        <source>Inf Retrieval J</source>  
        <year>2015</year>  
        <month>10</month>  
        <day>24</day>  
        <volume>19</volume>  
        <issue>1-2</issue>  
        <fpage>189</fpage>  
        <lpage>224</lpage>  
        <pub-id pub-id-type="doi">10.1007/s10791-015-9269-8</pub-id></nlm-citation>
      </ref>
      <ref id="ref60">
        <label>60</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Yates</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Goharian</surname>
            <given-names>N</given-names>
          </name>
        </person-group>
        <article-title>ADRTrace: Detecting Expected and Unexpected Adverse Drug Reactions from User Reviews on Social Media Sites</article-title>
        <year>2013</year>  
        <conf-name>European Conference on Information Retrieval</conf-name>
        <conf-date>March 24-27, 2013</conf-date>
        <conf-loc>Moscow, Russia</conf-loc>
        <fpage>816</fpage>  
        <lpage>819</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/chapter/10.1007/978-3-642-36973-5_92"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1007/978-3-642-36973-5_92</pub-id></nlm-citation>
      </ref>
      <ref id="ref61">
        <label>61</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Pang</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Lee</surname>
            <given-names>L</given-names>
          </name>
        </person-group>
        <article-title>Opinion mining and sentiment analysis</article-title>
        <source>Found Trends Inf Ret</source>  
        <year>2008</year>  
        <volume>2</volume>  
        <issue>1–2</issue>  
        <fpage>1</fpage>  
        <lpage>135</lpage>  
        <pub-id pub-id-type="doi">10.1561/1500000011</pub-id></nlm-citation>
      </ref>
      <ref id="ref62">
        <label>62</label>
        <nlm-citation citation-type="web">
        <source>Natural Language Toolkit</source>  
        <year>2017</year>  
        <access-date>2017-10-21</access-date>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.nltk.org/">http://www.nltk.org/</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6yHdLox5S"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref63">
        <label>63</label>
        <nlm-citation citation-type="web">
        <source>GNU Aspell</source>  
        <access-date>2018-03-29</access-date>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://aspell.net/">http://aspell.net/</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6yHdUtryf"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref64">
        <label>64</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Strohman</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Metzler</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Turtle</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Croft</surname>
            <given-names>W</given-names>
          </name>
        </person-group>
        <article-title>Indri: A language model-based search engine for complex queries</article-title>
        <year>2005</year>  
        <conf-name>Proceedings of the International Conference on Intelligent Analysis</conf-name>
        <conf-date>May 2 - 3, 2005</conf-date>
        <conf-loc>Washington, DC</conf-loc>
        <fpage>2</fpage>  
        <lpage>6</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://ciir.cs.umass.edu/pubfiles/ir-407.pdf"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref65">
        <label>65</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Ounis</surname>
            <given-names>I</given-names>
          </name>
          <name name-style="western">
            <surname>Amati</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>He</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Macdonald</surname>
            <given-names>C</given-names>
          </name>
        </person-group>
        <article-title>Terrier Information Retrieval Platform</article-title>
        <year>2005</year>  
        <conf-name>European Conference on Information Retrieval</conf-name>
        <conf-date>March 21-23, 2005</conf-date>
        <conf-loc>Santiago de Compostela, Spain</conf-loc>
        <fpage>517</fpage>  
        <lpage>519</lpage>  
        <pub-id pub-id-type="doi">10.1007/978-3-540-31865-1_37</pub-id></nlm-citation>
      </ref>
      <ref id="ref66">
        <label>66</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Feng</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Jansche</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Huenerfauth</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Elhadad</surname>
            <given-names>N</given-names>
          </name>
        </person-group>
        <article-title>A Comparison of Features for Automatic Readability Assessment</article-title>
        <year>2010</year>  
        <conf-name>Proceedings of the 23rd International Conference on Computational Linguistics</conf-name>
        <conf-date>August 23 - 27, 2010</conf-date>
        <conf-loc>Beijing, China</conf-loc>
        <fpage>276</fpage>  
        <lpage>284</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://people.dbmi.columbia.edu/noemie/papers/coling10.pdf"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref67">
        <label>67</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Barzilay</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Lapata</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Modeling local coherence: an entity-based approach</article-title>
        <source>Comput Linguist</source>  
        <year>2008</year>  
        <month>03</month>  
        <volume>34</volume>  
        <issue>1</issue>  
        <fpage>1</fpage>  
        <lpage>34</lpage>  
        <pub-id pub-id-type="doi">10.1162/coli.2008.34.1.1</pub-id></nlm-citation>
      </ref>
      <ref id="ref68">
        <label>68</label>
        <nlm-citation citation-type="web">
        <source>Beautiful Soup</source>  
        <access-date>2018-03-29</access-date>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://www.crummy.com/software/BeautifulSoup/">https://www.crummy.com/software/BeautifulSoup/</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6yHddZWCi"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref69">
        <label>69</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Elhadad</surname>
            <given-names>N</given-names>
          </name>
        </person-group>
        <article-title>Comprehending technical texts: predicting and defining unfamiliar terms</article-title>
        <source>AMIA Annu Symp Proc</source>  
        <year>2006</year>  
        <fpage>239</fpage>  
        <lpage>243</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/17238339"/>
        </comment>  
        <pub-id pub-id-type="medline">17238339</pub-id>
        <pub-id pub-id-type="pii">86239</pub-id>
        <pub-id pub-id-type="pmcid">PMC1839621</pub-id></nlm-citation>
      </ref>
      <ref id="ref70">
        <label>70</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Wu</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Hanauer</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Mei</surname>
            <given-names>Q</given-names>
          </name>
          <name name-style="western">
            <surname>Clark</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>An</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Proulx</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Zeng</surname>
            <given-names>Q</given-names>
          </name>
          <name name-style="western">
            <surname>Vydiswaran</surname>
            <given-names>V</given-names>
          </name>
          <name name-style="western">
            <surname>Collins-Thompson</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Zheng</surname>
            <given-names>K</given-names>
          </name>
        </person-group>
        <article-title>Assessing the readability of ClinicalTrials.gov</article-title>
        <source>J Am Med Inform Assoc</source>  
        <year>2016</year>  
        <month>03</month>  
        <volume>23</volume>  
        <issue>2</issue>  
        <fpage>269</fpage>  
        <lpage>275</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/26269536"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1093/jamia/ocv062</pub-id>
        <pub-id pub-id-type="medline">26269536</pub-id>
        <pub-id pub-id-type="pii">ocv062</pub-id>
        <pub-id pub-id-type="pmcid">PMC5009924</pub-id></nlm-citation>
      </ref>
      <ref id="ref71">
        <label>71</label>
        <nlm-citation citation-type="web">
        <source>Reddit</source>  
        <access-date>2018-03-03</access-date>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://www.reddit.com">https://www.reddit.com</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6yHdMrtgC"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref72">
        <label>72</label>
        <nlm-citation citation-type="web">
        <source>Reddit</source>  
        <year>2017</year>  
        <access-date>2018-03-29</access-date>
        <comment>Ask a Doctor 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://www.reddit.com/r/AskDocs/">https://www.reddit.com/r/AskDocs/</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6yHdhLy3x"/></comment> </nlm-citation>
      </ref>
      <ref id="ref73">
        <label>73</label>
        <nlm-citation citation-type="web">
        <source>PRAW: The Python Reddit API Wrapper</source>  
        <year>2017</year>  
        <access-date>2018-03-29</access-date>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://praw.readthedocs.io/en/latest/">https://praw.readthedocs.io/en/latest/</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6yHdm8YI2"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref74">
        <label>74</label>
        <nlm-citation citation-type="web">
        <source>Wikimedia</source>  
        <access-date>2018-03-29</access-date>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://dumps.wikimedia.org/enwiki/">https://dumps.wikimedia.org/enwiki/</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6yHdZCKxJ"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref75">
        <label>75</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Soldaini</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Cohan</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Yates</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Goharian</surname>
            <given-names>N</given-names>
          </name>
          <name name-style="western">
            <surname>Frieder</surname>
            <given-names>O</given-names>
          </name>
        </person-group>
        <article-title>Retrieving Medical Literature for Clinical Decision Support</article-title>
        <year>2015</year>  
        <conf-name>European Conference on Information Retrieval</conf-name>
        <conf-date>29 March - 2 April, 2015</conf-date>
        <conf-loc>Vienna, Austria</conf-loc>
        <fpage>538</fpage>  
        <lpage>549</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/chapter/10.1007/978-3-319-16354-3_59"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref76">
        <label>76</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Roberts</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Simpson</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Demner-Fushman</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Voorhees</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Hersh</surname>
            <given-names>W</given-names>
          </name>
        </person-group>
        <article-title>State-of-the-art in biomedical literature retrieval for clinical cases: a survey of the TREC 2014 CDS track</article-title>
        <source>Inf Retrieval J</source>  
        <year>2015</year>  
        <month>07</month>  
        <day>18</day>  
        <volume>19</volume>  
        <issue>1-2</issue>  
        <fpage>113</fpage>  
        <lpage>148</lpage>  
        <pub-id pub-id-type="doi">10.1007/s10791-015-9259-x</pub-id></nlm-citation>
      </ref>
      <ref id="ref77">
        <label>77</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Roberts</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Simpson</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Voorhees</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Hersh</surname>
            <given-names>W</given-names>
          </name>
        </person-group>
        <article-title>Overview of the TREC 2015 Clinical Decision Support Track</article-title>
        <year>2015</year>  
        <conf-name>Proceedings of The Twenty-Fourth Text REtrieval Conference</conf-name>
        <conf-date>November 17-20, 2015</conf-date>
        <conf-loc>Gaithersburg, Maryland, USA</conf-loc>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://trec.nist.gov/pubs/trec24/papers/Overview-CL.pdf"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref78">
        <label>78</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Kohlschütter</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Fankhauser</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Nejdl</surname>
            <given-names>W</given-names>
          </name>
        </person-group>
        <article-title>Boilerplate Detection using Shallow Text Features</article-title>
        <year>2010</year>  
        <conf-name>Proceedings of the third ACM international conference on Web search and data mining</conf-name>
        <conf-date>February 04 - 06, 2010</conf-date>
        <conf-loc>New York, New York, USA</conf-loc>
        <fpage>441</fpage>  
        <lpage>450</lpage>  
        <pub-id pub-id-type="doi">10.1145/1718487.1718542</pub-id></nlm-citation>
      </ref>
      <ref id="ref79">
        <label>79</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Pomikálek</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <source>Vysokoškolské Kvalifikační Práce</source>  
        <year>2011</year>  
        <access-date>2018-11-29</access-date>
        <comment>Removing Boilerplate and Duplicate Content from Web Corpora 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://theses.cz/id/nqo9nn/">https://theses.cz/id/nqo9nn/</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="74HmjvbRt"/></comment> </nlm-citation>
      </ref>
      <ref id="ref80">
        <label>80</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Guestrin</surname>
            <given-names>C</given-names>
          </name>
        </person-group>
        <article-title>XGBoost: A Scalable Tree Boosting System</article-title>
        <year>2016</year>  
        <conf-name>Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name>
        <conf-date>August 13 - 17, 2016</conf-date>
        <conf-loc>San Francisco, California, USA</conf-loc>
        <fpage>785</fpage>  
        <lpage>794</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://www.kdd.org/kdd2016/papers/files/rfp0697-chenAemb.pdf"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1145/2939672.2939785</pub-id></nlm-citation>
      </ref>
      <ref id="ref81">
        <label>81</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Cormack</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Clarke</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Buettcher</surname>
            <given-names>S</given-names>
          </name>
        </person-group>
        <article-title>Reciprocal rank fusion outperforms condorcet and individual rank learning methods</article-title>
        <year>2009</year>  
        <conf-name>Proceedings of the 32nd international ACM SIGIR conference on Research and development in information retrieval</conf-name>
        <conf-date>July 19 - 23, 2009</conf-date>
        <conf-loc>Boston, MA, USA</conf-loc>
        <fpage>758</fpage>  
        <lpage>759</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.150.2291&amp;rep=rep1&amp;type=pdf"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1145/1571941.1572114</pub-id></nlm-citation>
      </ref>
      <ref id="ref82">
        <label>82</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Song</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>He</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Hu</surname>
            <given-names>Q</given-names>
          </name>
          <name name-style="western">
            <surname>He</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Haacke</surname>
            <given-names>E</given-names>
          </name>
        </person-group>
        <article-title>ECNU at 2015 eHealth Task 2: User-centred Health Information Retrieval</article-title>
        <year>2015</year>  
        <conf-name>Conference and Labs of the Evaluation Forum</conf-name>
        <conf-date>September 8-11, 2015</conf-date>
        <conf-loc>Toulouse, France</conf-loc>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://ceur-ws.org/Vol-1391/80-CR.pdf"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref83">
        <label>83</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Oh</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Jung</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Kim</surname>
            <given-names>K</given-names>
          </name>
        </person-group>
        <article-title>KISTI at CLEF eHealth 2015 Task 2</article-title>
        <year>2015</year>  
        <conf-name>Conference and Labs of the Evaluation Forum</conf-name>
        <conf-date>September 8-11, 2015</conf-date>
        <conf-loc>Toulouse, France</conf-loc>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://ceur-ws.org/Vol-1391/17-CR.pdf"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref84">
        <label>84</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Soldaini</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Edman</surname>
            <given-names>W</given-names>
          </name>
          <name name-style="western">
            <surname>Goharian</surname>
            <given-names>N</given-names>
          </name>
        </person-group>
        <article-title>Team GU-IRLAB at CLEF eHealth 2016: Task 3</article-title>
        <year>2016</year>  
        <conf-name>Conference and Labs of the Evaluation Forum</conf-name>
        <conf-date>5-8 September, 2016</conf-date>
        <conf-loc>Évora, Portugal</conf-loc>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://ceur-ws.org/Vol-1609/16090143.pdf"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref85">
        <label>85</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Song</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>He</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Hu</surname>
            <given-names>Q</given-names>
          </name>
          <name name-style="western">
            <surname>He</surname>
            <given-names>L</given-names>
          </name>
        </person-group>
        <article-title>ECNU at 2016 eHealth Task 3: Patient-centred Information Retrieval</article-title>
        <year>2016</year>  
        <conf-name>Conference and Labs of the Evaluation Forum</conf-name>
        <conf-date>5-8 September, 2016</conf-date>
        <conf-loc>Évora, Portugal</conf-loc>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://ceur-ws.org/Vol-1609/16090157.pdf"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref86">
        <label>86</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Palotti</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Zuccon</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Hanbury</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>MM: A new Framework for Multidimensional Evaluation of Search Engines</article-title>
        <year>2018</year>  
        <month>10</month>  
        <conf-name>Proceedings of the 27th ACM International Conference on Information and Knowledge Management</conf-name>
        <conf-date>October 22 - 26, 2018</conf-date>
        <conf-loc>Torino, Italy</conf-loc>
        <fpage>1699</fpage>  
        <lpage>1702</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/citation.cfm?id=3269261"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1145/3269206.3269261</pub-id></nlm-citation>
      </ref>
      <ref id="ref87">
        <label>87</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Sakai</surname>
            <given-names>T</given-names>
          </name>
        </person-group>
        <article-title>Alternatives to Bpref</article-title>
        <year>2007</year>  
        <conf-name>Proceedings of the 30th annual international ACM SIGIR conference on Research and development in information retrieval</conf-name>
        <conf-date>July 23 - 27, 2007</conf-date>
        <conf-loc>Amsterdam, The Netherlands</conf-loc>
        <fpage>71</fpage>  
        <lpage>78</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/citation.cfm?doid=1277741.1277756"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1145/1277741.1277756</pub-id></nlm-citation>
      </ref>
      <ref id="ref88">
        <label>88</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Sanderson</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Test collection based evaluation of information retrieval systems</article-title>
        <source>Found Trends Inf Ret</source>  
        <year>2010</year>  
        <volume>4</volume>  
        <issue>4</issue>  
        <fpage>247</fpage>  
        <lpage>375</lpage>  
        <pub-id pub-id-type="doi">10.1561/1500000009</pub-id></nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
