<?xml version="1.0"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
	<front>
		<journal-meta>
			<journal-id journal-id-type="publisher-id">JMIR</journal-id>
			<journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
			<journal-title>Journal of Medical Internet Research</journal-title>
			<issn pub-type="epub">1438-8871</issn>
			<publisher>
				<publisher-name>Gunther Eysenbach</publisher-name>
				<publisher-loc>Centre for Global eHealth Innovation, Toronto, Canada</publisher-loc>
			</publisher>
		</journal-meta>
		<article-meta>
			<article-id pub-id-type="publisher-id">v9i1e4</article-id>
			<article-id pub-id-type="pmid">17478413</article-id>
			<article-id pub-id-type="doi">10.2196/jmir.9.1.e4</article-id>
			<!-- JMIR ms # 606 -->
			<article-categories>
				<subj-group subj-group-type="article-type">
					<subject>Original Paper</subject>
				</subj-group>
			</article-categories>
			<title-group>
				<article-title>Term Identification Methods for Consumer Health Vocabulary Development</article-title>
			</title-group>
						<contrib-group>
							<contrib contrib-type="reviewer">
								<name>
									<surname>Patrick</surname>
									<given-names>Timothy</given-names>
								</name>
							</contrib>
						</contrib-group>

			<contrib-group>
				<contrib contrib-type="author" id="contrib1" corresp="yes">
					<name name-style="western">
						<surname>Zeng</surname>
						<given-names>Qing T</given-names>
					</name>
					<degrees>PhD</degrees>
					<xref ref-type="aff" rid="aff1">1</xref>
					<address>
						<addr-line>Decision Systems Group</addr-line>
						<addr-line>Brigham and Women's Hospital</addr-line>
						<institution>Harvard Medical School</institution>
						<addr-line>Thorn 304, 75 Francis Street</addr-line>
						<addr-line>Boston, MA 02115</addr-line>
						<country>USA</country>
						<phone>+1 617 732 7694</phone>
						<fax>+1 617 739 3672</fax>
						<email>qzeng@dsg.harvard.edu</email>
					</address>
				</contrib>
				<contrib contrib-type="author" id="contrib2">
					<name name-style="western">
						<surname>Tse</surname>
						<given-names>Tony</given-names>
					</name>
					<degrees>PhD</degrees>
					<xref ref-type="aff" rid="aff2">2</xref>
				</contrib>
				<contrib contrib-type="author" id="contrib3">
					<name name-style="western">
						<surname>Divita</surname>
						<given-names>Guy</given-names>
					</name>
					<degrees>MS</degrees>
					<xref ref-type="aff" rid="aff2">2</xref>
					<xref ref-type="aff" rid="aff3">3</xref>
				</contrib>
				<contrib contrib-type="author" id="contrib4">
					<name name-style="western">
						<surname>Keselman</surname>
						<given-names>Alla</given-names>
					</name>
					<degrees>PhD</degrees>
					<xref ref-type="aff" rid="aff2">2</xref>
					<xref ref-type="aff" rid="aff4">4</xref>
				</contrib>
				<contrib contrib-type="author" id="contrib5">
					<name name-style="western">
						<surname>Crowell</surname>
						<given-names>Jon</given-names>
					</name>
					<degrees>MS</degrees>
					<xref ref-type="aff" rid="aff1">1</xref>
				</contrib>
				<contrib contrib-type="author" id="contrib6">
					<name name-style="western">
						<surname>Browne</surname>
						<given-names>Allen C</given-names>
					</name>
					<degrees>MS</degrees>
					<xref ref-type="aff" rid="aff2">2</xref>
				</contrib>
				<contrib contrib-type="author" id="contrib7">
					<name name-style="western">
						<surname>Goryachev</surname>
						<given-names>Sergey</given-names>
					</name>
					<degrees>MS</degrees>
					<xref ref-type="aff" rid="aff1">1</xref>
				</contrib>
				<contrib contrib-type="author" id="contrib8">
					<name name-style="western">
						<surname>Ngo</surname>
						<given-names>Long</given-names>
					</name>
					<degrees>PhD</degrees>
					<xref ref-type="aff" rid="aff1">1</xref>
				</contrib>
			</contrib-group>
			<aff id="aff4" rid="aff4">
				<sup>4</sup>
				<addr-line>Aquilent</addr-line>
				<addr-line>Inc</addr-line>
				<addr-line>Laurel</addr-line>
				<addr-line>MD</addr-line>

				<country>USA</country>
			</aff>
			<aff id="aff3" rid="aff3">
				<sup>3</sup>
				<addr-line>Management Systems Designers</addr-line>
				<addr-line>Inc</addr-line>
				<addr-line>Fairfax</addr-line>
				<addr-line>VA</addr-line>
				<country>USA</country>
			</aff>
			<aff id="aff2" rid="aff2">
				<sup>2</sup>
				<addr-line>LHNCBC</addr-line>
				<addr-line>National Library of Medicine</addr-line>
				<addr-line>NIH</addr-line>
				<addr-line>DHHS</addr-line>
				<addr-line>Bethesda</addr-line>
				<addr-line>MD</addr-line>
				<country>USA</country>
			</aff>
			<aff id="aff1" rid="aff1">
				<sup>1</sup>
				<addr-line>Decision Systems Group</addr-line>
				<addr-line>Brigham and Women&#x2019;s Hospital</addr-line>
				<institution>Harvard Medical School</institution>
				<addr-line>Boston</addr-line>
				<addr-line>MA</addr-line>
				<country>USA</country>
			</aff>
			<pub-date pub-type="collection">
				<season>Jan-Mar</season>
				<year>2007</year>
			</pub-date>
			<pub-date pub-type="epub">
				<day>14</day>
				<month>3</month>
				<year>2007</year>
			</pub-date>
			<volume>9</volume>
			<issue>1</issue>
			<elocation-id>e4</elocation-id>
			<history>
				<date date-type="received">
					<day>23</day>
					<month>10</month>
					<year>2006</year>
				</date>
				<date date-type="rev-request">
					<day>21</day>
					<month>11</month>
					<year>2006</year>
				</date>
				<date date-type="rev-recd">
					<day>21</day>
					<month>11</month>
					<year>2006</year>
				</date>
				<date date-type="accepted">
					<day>22</day>
					<month>02</month>
					<year>2007</year>
				</date>
			</history>
			<copyright-statement>&#xA9; Qing T Zeng, Tony Tse, Guy Divita, Alla Keselman, Jon Crowell, Allen C Browne, Sergey Goryachev, Long Ngo.  Originally published in the Journal of Medical Internet Research (http://www.jmir.org), 14.03.2007.  Except where otherwise noted, articles published in the Journal of Medical Internet Research are distributed under the terms of the Creative Commons Attribution License (http://www.creativecommons.org/licenses/by/2.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited, including full bibliographic details and the URL (see "please cite as" above), and this statement is included.</copyright-statement>
			<copyright-year>2007</copyright-year>
			<self-uri xlink:href="http://www.jmir.org/2007/1/e4/" xlink:type="simple"/>
			<abstract>
				<sec sec-type="background">
					<title>Background</title>
					<p> The development of consumer health information applications such as health education websites has motivated the research on consumer health vocabulary (CHV). Term identification is a critical task in vocabulary development. Because of the heterogeneity and ambiguity of consumer expressions, term identification for CHV is more challenging than for professional health vocabularies.</p>
				</sec>
				<sec sec-type="objective">
					<title>Objective</title>
					<p> For the development of a CHV, we explored several term identification methods, including collaborative human review and automated term recognition methods.</p>
				</sec>
				<sec sec-type="methods">
					<title>Methods</title>
					<p> A set of criteria was established to ensure consistency in the collaborative review, which analyzed 1893 strings. Using the results from the human review, we tested two automated methods&#x2014;C-value formula and a logistic regression model.</p>
				</sec>
				<sec sec-type="results">
					<title>Results</title>
					<p> The study identified 753 consumer terms and found the logistic regression model to be highly effective for CHV term identification (area under the receiver operating characteristic curve = 95.5%).</p>
				</sec>
				<sec sec-type="conclusions">
					<title>Conclusions</title>
					<p> The collaborative human review and logistic regression methods were effective for identifying terms for CHV development.</p>
				</sec>
			</abstract>
			<kwd-group>
				<kwd>Consumer health information</kwd>
				<kwd>vocabulary</kwd>
				<kwd>natural language processing</kwd>
			</kwd-group>
		</article-meta>
	</front>
	<body>
		<sec sec-type="introduction">
			<title>Introduction</title>
			<p>Two important steps in vocabulary development are (1) the identification of candidate strings (ie, words or phrases) in a domain and (2) the determination of which of these should be included in a vocabulary as &#x201C;valid&#x201D; terms, also called &#x201C;termhood determination.&#x201D; Health vocabulary development, which has a long history, requires significant effort for collecting candidate terms and determining termhood [<xref ref-type="bibr" rid="ref1">1</xref>]. While vocabularies such as SNOMED (Systematized Nomenclature of Medicine) and ICD-9 (International Classification of Diseases, Ninth Revision) include many health terms, there is no consensus on termhood criteria (ie, what constitutes a &#x201C;term&#x201D;) [<xref ref-type="bibr" rid="ref2">2</xref>]. The decision to include terms in a vocabulary is made for a particular domain for certain tasks (eg, indexing or billing). Thus, the review criteria and procedures used by vocabulary developers, which are often not published, inevitably differ. Terms included in health vocabularies also vary significantly. For instance, in the Unified Medical Language System (UMLS), the same concept is often represented in various source vocabularies by different terms. The terms &#x201C;head ache&#x201D; and &#x201C;cranial pain&#x201D; are both synonyms of the UMLS concept &#x201C;headache.&#x201D; The source vocabulary for &#x201C;head ache&#x201D; is DXplain, and the source vocabulary for &#x201C;cranial pain&#x201D; is MeSH (medical subject heading).</p>
			<p>Research and development of controlled consumer health vocabularies (CHVs) is a relatively new endeavor in the health vocabulary field [<xref ref-type="bibr" rid="ref3">3</xref>]. In the general biomedical literature, research on consumer understanding of medical words and concepts has focused primarily on relatively short lists of discrete terms in various specialties. In the informatics domain, a few companies (eg, Apelon and WellMed) offer proprietary CHV products, though these products have not been publicly evaluated.</p>
			<p>The general goal of our CHV research is to help overcome the vocabulary gap between consumers and health information provided by informatics applications. The specific aim of this paper is to elucidate term identification methods for CHVs. CHV research has largely been driven by the proliferation of health-related materials on the Web, the emergence of electronic personal health records, as well as the growing availability of various consumer health applications (eg, decision support tools). Over the past five years, researchers have found that consumer terms are not well covered by the existing health vocabularies, which mostly represent the language of health professionals [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref9">9</xref>]. Indeed, expressions used by consumers to describe health-related concepts and relationships among such concepts frequently differ on multiple levels (ie, syntactic, conceptual, and explanatory) from those of professionals. Thus, consumer health informatics research and application development will benefit from the development of CHVs.</p>
			<p>Developing and validating a comprehensive CHV is challenging because &#x201C;consumers&#x201D; constitute a plethora of highly diverse groups. Further, individuals uniquely acquire health-related terms and concepts from formal and informal sources (eg, media exposure) and from personal experiences. Nevertheless, there is strong evidence of the stability of lay health language among particular populations, for specific tasks [<xref ref-type="bibr" rid="ref3">3</xref>].</p>
			<p>We have been working on an open access and collaborative (OAC) CHV project. The first step in creating the OAC CHV was to identify consumer terms since surface forms, represented as strings in written text, are more tractable than concepts (ie, underlying meanings) or semantic relations, both of which require in-depth understanding of term usage, rhetorical intent, and explanatory models. Because consumer terms are heterogeneous and even less well defined than professional terms [<xref ref-type="bibr" rid="ref10">10</xref>], the termhood determination task proved to be particularly challenging. Our term identification effort has been guided by two principles:</p>
			<p>1. CHVs consist of actual terms commonly used by consumers (in any particular discourse group).</p>
			<p>2. CHV terms must allow for computer processing of consumer language.</p>
			<p>Since many professional health vocabulary terms are already used by consumers, though in some cases with different or broader semantics (eg, &#x201C;diabetes&#x201D; for diabetes mellitus, types 1 and 2), we focused on consumer terms not yet represented in existing vocabularies (eg, &#x201C;broken finger&#x201D; for any type of fracture in the &#x201C;distal,&#x201D; &#x201C;middle,&#x201D; or &#x201C;proximal phalanges&#x201D;).</p>
			<p>Because the number of candidate strings is often very large in any domain, researchers have explored the use of corpus-based automated term recognition (ATR) methods for extracting the most promising strings for human review from domain-specific documents [<xref ref-type="bibr" rid="ref11">11</xref>, <xref ref-type="bibr" rid="ref12">12</xref>]. ATRs vary from statistical or information theory&#x2013;based approaches (eg, <italic>t</italic> test) [<xref ref-type="bibr" rid="ref13">13</xref>] to syntax-based methods (eg, noun phrase extraction and context analysis) [<xref ref-type="bibr" rid="ref14">14</xref>] and hybrid mechanisms (eg, C-value formula) [<xref ref-type="bibr" rid="ref15">15</xref>, <xref ref-type="bibr" rid="ref16">16</xref>]. Both the <italic>t</italic> test and the C-value formula have been used successfully in termhood determination. Such studies reinforce the general notion that strings typically considered as terms share some common characteristics, such as words in a term tend to occur more frequently together, terms are often noun phrases, and terms may be part of several longer strings.</p>
			<p>In the biomedical domain, ATR methods have been applied to Medline literature [<xref ref-type="bibr" rid="ref17">17</xref>] and clinical reports [<xref ref-type="bibr" rid="ref15">15</xref>]. While most ATR methods outside the biomedical domain were designed to be general purpose, biomedical ATR methods tend to be more narrowly focused [<xref ref-type="bibr" rid="ref18">18</xref>]. The type of terms targeted by ATR vary, including gene and protein names in a number of recent studies [<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref21">21</xref>].</p>
			<p>In this study, we first identified CHV terms through collaborative review of strings derived from query logs of a consumer health site [<xref ref-type="bibr" rid="ref22">22</xref>]. Because of the considerable variability in lay health expressions, standardized review criteria and procedures to ensure consistency in selecting CHV terms were developed. After obtaining the human-reviewed n-grams (ie, n word strings), we experimented with two ATR methods: logistic regression and the C-value formula. The initial features used in the regression model were informed by existing ATR methods, in particular, the C-value model [<xref ref-type="bibr" rid="ref16">16</xref>] and the termhood formula proposed by Wermter and Hahn [<xref ref-type="bibr" rid="ref12">12</xref>]. We also evaluated the popular C-value method.</p>
			<p>Our use of ATRs in this study differs from that in prior studies in the biomedical domain in two aspects: (1) short phrases from query logs were used as the text corpus rather than entire sentences from full-text sources, and (2) &#x201C;new&#x201D; CHV terms, not yet part of existing vocabularies, were identified rather than &#x201C;pre-existing&#x201D; terms such as UMLS terms.</p>
		</sec>
		<sec sec-type="methods">
			<title>Methods</title>
			<p>The term identification study had three components:</p>
			<list list-type="order">
				<list-item>
					<p>Candidate string extraction from a query log data set of terms that could not be mapped to UMLS</p>
				</list-item>
				<list-item>
					<p>Collaborative manual review of a subset of the candidate strings and identification of CHV terms</p>
				</list-item>
				<list-item>
					<p>Application of ATR methods (the C-value formula and logistic regression models) to human-reviewed CHV terms</p>
				</list-item>
			</list>
			<sec>
				<title>Candidate String Extraction</title>
				<p>We obtained a set of query log files [<xref ref-type="bibr" rid="ref22">22</xref>] from the MedlinePlus site covering the period from October 2002 to October 2003, courtesy of the National Library of Medicine (NLM). The log data were preprocessed to filter out all queries that were not in English, appeared to be machine generated (eg, very large numbers of queries from the same IP address within a minute), and that were redundant (ie, from the same host at time intervals of less than 5 minutes).</p>
				<p>The preprocessed queries were then mapped to the 2004AA version of the UMLS Metathesaurus using lexical methods (ie, removing non-alphanumeric symbols, stemming, normalization, and truncation). Queries that did not map to the UMLS Metathesaurus were broken into n-grams. N-grams that matched terms in the Metathesaurus were removed, and the remaining n-grams were collected into sets by frequency and number of words.</p>
				<p>We used n-gram analysis to find candidate terms from unmapped query strings. The n-gram analysis uses the frequencies of n-grams and text fragments of n words in a text sample to estimate the likelihood that a string is a potential term. In general, the more frequently an n-gram appears in text documents, the greater the likelihood that the n-gram is a &#x201C;useful&#x201D; term.</p>
			</sec>
			<sec>
				<title>Collaborative Manual Review</title>
				<p>Six researchers (first six of the authors) reviewed candidate strings (n-grams) collaboratively. First, each reviewer independently reviewed a subset of the n-grams (n = 1 to 4 and frequency &gt; 50) and voted on whether they should be considered CHV terms. Unanimous votes for n-grams that were reviewed by at least three people were entered as &#x201C;master&#x201D; votes. Otherwise, termhood was discussed by the entire group until consensus was reached and a master vote was cast. To support reviewers from geographically distributed locations and to calculate votes, a specially designed Web-based application [<xref ref-type="bibr" rid="ref23">23</xref>] was utilized (<xref ref-type="fig" rid="figure1">Figure 1</xref>).</p>
				<p>
					<fig id="figure1" position="float">
						<label>Figure 1</label>
						<caption>
							<p>Application to support collaborative manual review of candidate strings</p>
						</caption>
						<graphic xlink:href="jmir_v9i1e4_fig1.jpg" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
					</fig>
				</p>
				<p>Through several iterations of votes and discussion, we established the following review criteria:</p>
				<list list-type="order">
					<list-item>
						<p>CHV terms should be syntactic constituents or phrases such as a noun phrase or adjective phrase (eg, &#x201C;bypass surgery&#x201D; is a phrase, but &#x201C;fever in&#x201D; is not). Special attention should be given to noun phrases.</p>
					</list-item>
					<list-item>
						<p>CHV terms should have independent semantics and should not only occur as a part of longer valid terms or as a part of wild card searches (eg, [chicken-, small-] &#x201C;pox vaccine&#x201D; is not considered a CHV term).</p>
					</list-item>
					<list-item>
						<p>CHV terms should be specific to the medical domain (eg, &#x201C;Google&#x201D; and &#x201C;Yahoo&#x201D; are general words, not CHV terms).</p>
					</list-item>
					<list-item>
						<p>CHV terms should function as semantic components in addition to functioning as syntactic components (eg, stop words &#x201C;the&#x201D; and &#x201C;a&#x201D; as well as empty verbs &#x201C;make&#x201D; and &#x201C;take&#x201D; are not considered CHV terms).</p>
					</list-item>
					<list-item>
						<p>N-grams representing existing UMLS medical concepts are considered to be CHV terms, but CHV terms may represent non-UMLS concepts.</p>
					</list-item>
					<list-item>

						<p>Eponymous forms of CHV terms are considered to be CHV terms (eg, &#x201C;Parkinson&#x2019;s&#x201D;).</p>
					</list-item>
					<list-item>
						<p>CHV terms may include spelling errors (eg, &#x201C;Chron's disease&#x201D;). These misspelled terms are given the label &#x201C;disparaged.&#x201D;</p>
					</list-item>
					<list-item>
						<p>Terms with distinct clinical semantics (eg, &#x201C;result&#x201D;) are considered to be CHV terms, regardless of ambiguity and/or vagueness in other domains.</p>
					</list-item>
				</list>
				<p>We singled out several types of terms for future investigation and assigned special labels to them:</p>
				<list list-type="bullet">
					<list-item>
						<p>meta: A term that is usually used to indicate the category/type of information sought or presented (eg, &#x201C;picture,&#x201D; &#x201C;guideline,&#x201D; and &#x201C;tutorial&#x201D;).</p>
					</list-item>
					<list-item>
						<p>modifier: A term not typically used by itself, but for limiting or qualifying other terms (eg, &#x201C;sexually&#x201D; as in &#x201C;sexually active&#x201D;).</p>
					</list-item>
					<list-item>
						<p>relation: A term not typically used by itself, but used to describe relations among concepts (eg, &#x201C;caused by&#x201D; and &#x201C;results in&#x201D;). We also include the unary relation &#x201C;not&#x201D; in this set.</p>
					</list-item>
				</list>
				<p>Currently, we consider terms classified as meta and modifier to be CHV terms, but relations are not considered CHV terms.</p>
				<p>Once these review criteria were established, researchers double-checked the previously cast master votes for compliance. A second round of discussion resulted in some adjustments to the votes.</p>
			</sec>
			<sec>
				<title>Application of Automated Term Recognition (ATR)</title>
				<p>We explored the use of two ATR methods to facilitate candidate selection for human review: (1) the C-value method (C loosely stands for &#x201C;candidate collection&#x201D;) and (2) logistic regression.</p>
				<p>We applied the C-value method to the strings that had already been reviewed. First, the strings were parsed to filter out single-word strings and strings that were not noun phrases. The C-value was calculated using the formula [<xref ref-type="bibr" rid="ref16">16</xref>] given in Textbox 1.</p>
				<boxed-text id="box1" position="float">
					<title>The C-value was calculated using this formula</title>
					<p><italic>C-value(<bold>a</bold>) =</italic> log<sub>2</sub><italic>|<bold>a</bold>|*f(<bold>a</bold>)</italic> if <bold><italic>a</italic></bold> is not nested</p>
					<p>(When <italic><bold>a</bold></italic> is a substring of <bold><italic>b,</italic></bold> we refer to <italic><bold>a</bold></italic> as nested and <italic><bold>b</bold></italic> as <italic><bold>a</bold></italic>&#x2019;s nesting string.)</p>
					<p><italic>C-value(<bold>a</bold>) =</italic> log<sub>2</sub><italic>|<bold>a</bold>|*(f(<bold>a</bold>) &#x2212; 1/P(T<bold>a</bold>)*</italic>sum<italic>(f(<bold>b</bold>)))</italic> if <bold><italic>a</italic></bold> is nested</p>
					<p><italic><bold>a</bold> =</italic> candidate string (eg, &#x201C;failure&#x201D;)</p>
					<p><italic><bold>b</bold> =</italic> nesting strings (eg, &#x201C;heart failure&#x201D;)</p>
					<p><italic>|<bold>a</bold>| =</italic> length (number of words) of <italic><bold>a</bold></italic></p>
					<p><italic>f(<bold>a</bold>) =</italic> frequency of <italic><bold>a</bold></italic> in the corpus</p>
					<p><italic>T<bold>a</bold> =</italic> set of <italic><bold>b</bold></italic> that contain <italic><bold>a</bold></italic></p>
					<p><italic>P(T<bold>a</bold>) =</italic> number of <italic><bold>b</bold></italic> in <italic>T<bold>a</bold>
                        </italic></p>
					<p><italic>f(<bold>b</bold>) =</italic> frequency of <italic><bold>b</bold></italic> in the corpus</p>
				</boxed-text>
				<p>To create the logistic regression model that predicts the termhood of a candidate string <bold><italic>a,</italic></bold> we explored syntactic category, frequency of occurrence, string length, word count, and the number, frequency, and termhood status of <bold><italic>a</italic></bold>&#x2019;s nesting and nested strings as variables and used the master vote as outcome. Human-reviewed strings were used as the training and testing data sets. The initial feature variables were as follows:</p>
					<list list-type="order">
						<list-item>
							<p>part-of-speech (POS) tag (eg, noun or adjective) of the first word</p>
						</list-item>
						<list-item>
							<p>POS tag of the last word</p>
						</list-item>
						<list-item>
							<p>noun phrase status (ie, yes/no)</p>
						</list-item>
						<list-item>
							<p>word count (ie, number of words in <bold><italic>a</italic></bold>)</p>
						</list-item>
						<list-item>
							<p>number of distinct <bold><italic>a</italic></bold>&#x2019;s nesting string <bold><italic>b</italic></bold>
                            </p>
						</list-item>
						<list-item>
							<p>number of repeated <bold><italic>b</italic></bold>
                            </p>
						</list-item>
						<list-item>
							<p>percentage of distinct <bold><italic>b</italic></bold> that are known valid (UMLS) terms</p>
						</list-item>
						<list-item>
							<p>percentage of repeated <bold><italic>b</italic></bold> that are known valid (UMLS) terms</p>
						</list-item>
						<list-item>
							<p>number of distinct <bold><italic>a</italic></bold>&#x2019;s nested string <bold><italic>c</italic></bold>
                            </p>
						</list-item>
						<list-item>
							<p> number of repeated <bold><italic>c</italic></bold>
                            </p>
						</list-item>

						<list-item>
							<p>percentage of distinct <bold><italic>c</italic></bold> that are known valid (UMLS) terms</p>
						</list-item>
						<list-item>
							<p>percentage of repeated <bold><italic>c</italic></bold> that are known valid (UMLS) terms</p>
						</list-item>
						<list-item>
							<p>frequency of <bold><italic>a</italic></bold>
                            </p>
						</list-item>
						<list-item>
							<p>number of distinct host <bold><italic>h</italic></bold> that <bold><italic>a</italic></bold> originated from</p>
						</list-item>
						<list-item>
							<p>average number of distinct queries containing <bold><italic>a</italic></bold> per host</p>
						</list-item>
					</list>
				<p>The frequency distribution of the POS tags (variables 1 and 2) required them to be collapsed into fewer categories for modeling. The original tags came from a Brill-style, rule-based POS tagger developed by Mark Hepple [<xref ref-type="bibr" rid="ref24">24</xref>]. We first transformed them into a smaller set of tags used by the UMLS SPECIALIST Lexicon of the National Library of Medicine (NLM) [<xref ref-type="bibr" rid="ref25">25</xref>]. (Details of the transformation rules can be found in [<xref ref-type="bibr" rid="ref26">26</xref>].) Several tags appeared with low frequency and were then merged: the tags AUXILARY and MODAL were merged with VERB, and the tags CONJUNCTION, DETERMINER, NUMBER, SYM, UNKNOWN, PRONOUN, and PREP were merged into a new category, OTHER.</p>
				<p>The continuous variables (variables 4 to 15) were dichotomized based on the median value. The dichotomized variables were used in the logistic regression to predict or explain the probability of having a term voted &#x201C;yes&#x201D; for termhood.</p>
				<p>The logistic regression model building was carried out by a stepwise procedure. After calculating the odds ratio estimates, most of the variables were dropped. The remaining variables 1, 2, 3, 6, 10, and 15 were represented in the regression formula as FirstPOS, LastPOS, np_value, repeat_sup_gt_median, repeat_sub_gt_median, and distinct_perhost_gt_median.</p>
				<p>For both the C-value formula and the regression model, we calculated the sensitivity and specificity at different thresholds to create the receiver operating characteristic (ROC) curves. To estimate the area under the ROC curve for the logistic regression, we used the c-statistic [<xref ref-type="bibr" rid="ref27">27</xref>] (note that this is not the same as C-value). It has the following meaning. From the final multivariable logistic regression model, the predicted probability of the termhood voted &#x201C;yes&#x201D; can be computed for each term. For any two terms, one with vote &#x201C;yes&#x201D; and one with vote &#x201C;no,&#x201D; if the predicted probability for vote &#x201C;yes&#x201D; is higher than the predicted probability for vote &#x201C;no,&#x201D; then we have a concordant pair. If the predicted probability of vote &#x201C;no&#x201D; is higher, then we have a discordant pair. If the pair is neither concordant nor discordant, then it is tied. Let <italic>T</italic> be the total number of all possible yes-no pairs of all terms. Let <italic>C</italic> be the number of concordant pairs, and <italic>D</italic> the number of discordant pairs. The c-statistic is calculated as <italic>c</italic> = (<italic>C</italic> + 0.5(<italic>T</italic> &#x2212; <italic>C</italic> &#x2212; <italic>D</italic>)) / <italic>T</italic>.</p>
			</sec>
		</sec>
		<sec sec-type="results">
			<title>Results</title>
			<p>We identified 18454 candidate n-grams (n = 1 to 5); 7967 were reviewed by at least one reviewer, and 1893 distinct n-grams received master votes (<xref ref-type="table" rid="table1">Table 1</xref>). Among the n-grams with master votes, 23 were meta, 39 were modifier, and 5 were relation.</p>
			<table-wrap id="table1" position="float">
				<label>Table 1</label>
				<caption>
					<p>Number of n-grams with master votes and number of n-grams voted as CHV terms</p>
				</caption>
				<table width="505" border="1" cellpadding="7" cellspacing="0" rules="groups">
					<col width="112"/>
					<col width="177"/>
					<col width="172"/>
					<thead>
						<tr valign="top">
							<td>
								<bold>N-gram</bold>
							</td>
							<td>
								<bold>Number of Master Votes</bold>
							</td>
							<td>
								<bold>Number of CHV Terms</bold>
							</td>
						</tr>
					</thead>
					<tbody>
						<tr valign="top">
							<td>1-gram</td>
							<td>379</td>
							<td>261</td>
						</tr>
						<tr valign="top">
							<td>2-gram</td>
							<td>1101</td>
							<td>303</td>
						</tr>
						<tr valign="top">
							<td>3-gram</td>
							<td>356</td>
							<td>154</td>
						</tr>
						<tr valign="top">
							<td>4-gram</td>
							<td>57</td>
							<td>35</td>
						</tr>
						<tr valign="top">
							<td>
								<bold>Total</bold>
							</td>
							<td>
								<bold>1893</bold>
							</td>
							<td>
								<bold>753</bold>
							</td>
						</tr>
					</tbody>
				</table>
			</table-wrap>

				<fig id="figure2" position="float">
					<label>Figure 2</label>
					<caption><p>The logistic regression model</p></caption>
					<graphic xlink:href="jmir_v9i1e4_fig2.jpg" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
				</fig>

			<p>The logistic regression model is shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>. In this logistic regression model, syntactic information (first 9 variables) and nesting pattern (last 3 variables) determine the termhood. The importance of syntactic information has long been recognized by models like the C-value. Conspicuously, word count and frequency are missing from our model, though longer and more frequent strings are more likely to be considered terms. To a large extent, length and frequency are reflected by the nesting patterns: very short strings are likely to be part of many nesting strings, and less frequent strings are likely to be coincidental combinations of more common words, meaning that they would have more nested strings.</p>
			<p>The ROC curves for C-value and the regression model are shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>. The area under the ROC curve (AUC) is 70.9% for the C-value method and 95.5% for the regression model. Higher AUC signifies increased distinguishing power: 100% = perfect discriminative ability, 50% = no ability, &lt; 50% = predictions were made in the wrong direction. Thus, the AUC results suggest the regression model to be very effective and better than the C-value for identifying CHV terms.</p>
			<p>
				<fig id="figure3" position="float">
					<label>Figure 3</label>
					<caption>
						<p>Curves for C-value and the regression model</p>
					</caption>
					<graphic xlink:href="jmir_v9i1e4_fig3.jpg" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
				</fig>
			</p>
		</sec>
		<sec sec-type="discussion">
			<title>Discussion</title>
			<p>This paper reports on several term identification methods for the OAC CHV project. We established a set of criteria and procedures to conduct a manual review, resulting in multiple reviewers reaching consensus on 1893 n-grams, including identification of 753 new terms for inclusion in the OAC CHV that were not in the 2004AC version of UMLS.</p>
			<p>The OAC termhood criteria were established collaboratively, reflecting the reviewers&#x2019; backgrounds in several different fields: controlled vocabulary, health informatics, linguistics, cognitive science, and computer science. While the OAC termhood criteria could be further refined and termhood criteria for health vocabularies are often not published, we believe publishing such criteria could benefit vocabulary research. For instance, many articles evaluate vocabularies and study methods of mapping one vocabulary to another [<xref ref-type="bibr" rid="ref28">28</xref>-<xref ref-type="bibr" rid="ref31">31</xref>]. These evaluations and mapping methods could be better guided by the termhood criteria of target vocabularies.</p>
			<p>In CHV research, the termhood issue is of particular importance because there has been limited discussion and little consensus on what should be considered a consumer term. Is &#x201C;sun poisoning&#x201D; an acceptable term? How about &#x201C;skin conditions?&#x201D; As was pointed out in the Introduction, health professional vocabularies do not always agree on the termhood of a phrase. Consumer expressions, however, require more scrutiny because it is harder to determine their semantics and contexts of usage.</p>
			<p>We tested two ATR methods (C-value and logistic regression) on the human-reviewed n-grams. The C-value was useful for determining termhood, though it did not have high distinguishing power (AUC = 70.9%). The AUC for the logistic regression model was 95.5%, which is fairly satisfactory.</p>
			<p>These results suggest that a specially fitted logistic regression model is better suited than the generic C-value method for the task of identifying CHV terms according to our criteria. The C-value method&#x2019;s performance problem was partially caused by issues unique to this data set, among them the inclusion of infrequent misspellings and the high frequency of most candidates, which made frequency a less reliable predictor. The imperfection in noun-phrase parsing is not unexpected, though the relatively short query string posed a greater challenge for parsing. Like many vocabularies, OAC includes strings that are single words and are not noun phrases, while C-value is typically calculated for multiword noun phrases.</p>
			<p>The logistic regression model demonstrated excellent suitability for OAC termhood determination. It may have to be altered to be used with other corpora or for other types of vocabularies due to the particularities of query-based corpus attributes such as the short length of the documents. Nonetheless, training of predictive models for a particular corpus and vocabulary is a generalizable strategy. Although general principles exist, the determination of which strings are to be considered legitimate vocabulary terms often depends on the domain and the vocabulary developers&#x2019; criteria (eg, including verb phrases [<xref ref-type="bibr" rid="ref15">15</xref>] or not).</p>
			<p>The regression model utilizes syntactic and nesting pattern features; both types of features are well-recognized termhood indicators. A concern often raised about CHV research is that the syntax and semantics of consumer phrases are too unruly to be represented in a computable vocabulary. The fact that many consumer phrases have common term characteristics suggests that they are tractable terms.</p>
			<p>Our study has several limitations. Because consumer utterances are not as readily available as corpora of medical literature or clinical records, we used query logs that contained relatively few complete sentences. Consequently, this resulted in many POS and noun phrase analysis errors. As well, we only had researchers and not lay consumers review the candidate terms, due to budget and logistic constraints. However, the analysis was based on utterances from queries submitted by tens of thousands of consumers.</p>
			<p>Based on the results of this study, we plan to apply the logistic regression model to the candidate n-grams and select those predicted to be terms for human review. We also plan to add the identified CHV terms to OAC. The authors associated with NLM are interested in investigating similar techniques to aid in identifying candidate terms for inclusion into the SPECIALIST Lexicon of the NLM, and for quality control.</p>
		</sec>
	</body>
	<back>
		<ack>
			<p>We thank the National Library of Medicine (NLM) for sharing the MedlinePlus query log data. This work is supported by the National Institutes of Health (NIH) grant R01 LM07222 and by the Intramural Research Program of the NIH, NLM.</p>
		</ack>
		<fn-group>
			<fn fn-type="conflict">
				<p>None declared.</p>
			</fn>
		</fn-group>
		<ref-list>
			<ref id="ref1">
				<label>1</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Chute</surname>
							<given-names>C G</given-names>
						</name>
					</person-group>
					<article-title>Clinical classification and terminology: some history and current observations</article-title>
					<source>J Am Med Inform Assoc</source>
					<year>2000</year>
					<month>5</month>
					<volume>7</volume>
					<issue>3</issue>
					<fpage>298</fpage>
					<lpage>303</lpage>
					<comment>
						<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.pubmedcentral.nih.gov/articlerender.fcgi?tool=pubmed&amp;pubmedid=10833167" ext-link-type="uri"/>
					</comment>
					<pub-id pub-id-type="medline">10833167</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref2">
				<label>2</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Cimino</surname>
							<given-names>J J</given-names>
						</name>
					</person-group>
					<article-title>Desiderata for controlled medical vocabularies in the twenty-first century</article-title>
					<source>Methods Inf Med</source>
					<year>1998</year>
					<month>11</month>
					<volume>37</volume>
					<issue>4-5</issue>
					<fpage>394</fpage>
					<lpage>403</lpage>
					<pub-id pub-id-type="medline">9865037</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref3">
				<label>3</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Zeng</surname>
							<given-names>Qing T</given-names>
						</name>
						<name name-style="western">
							<surname>Tse</surname>
							<given-names>Tony</given-names>
						</name>
					</person-group>
					<article-title>Exploring and developing consumer health vocabularies</article-title>
					<source>J Am Med Inform Assoc</source>
					<year>2006</year>
					<month>1</month>
					<volume>13</volume>
					<issue>1</issue>
					<fpage>24</fpage>
					<lpage>9</lpage>
					<comment>
						<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.pubmedcentral.nih.gov/articlerender.fcgi?tool=pubmed&amp;pubmedid=16221948" ext-link-type="uri"/>
					</comment>
					<pub-id pub-id-type="medline">16221948</pub-id>
					<pub-id pub-id-type="pii">M1761</pub-id>
					<pub-id pub-id-type="doi">10.1197/jamia.M1761</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref4">
				<label>4</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Brennan</surname>
							<given-names>Patricia Flatley</given-names>
						</name>
						<name name-style="western">
							<surname>Aronson</surname>
							<given-names>Alan R</given-names>
						</name>
					</person-group>
					<article-title>Towards linking patients and clinical information: detecting UMLS concepts in e-mail</article-title>
					<source>J Biomed Inform</source>
					<year>2003</year>
					<month>8</month>
					<volume>36</volume>
					<issue>4-5</issue>
					<fpage>334</fpage>
					<lpage>41</lpage>
					<pub-id pub-id-type="medline">14643729</pub-id>
					<pub-id pub-id-type="pii">S1532046403000984</pub-id>
					<pub-id pub-id-type="doi">10.1016/j.jbi.2003.09.017</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref5">
				<label>5</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Zeng</surname>
							<given-names>Q</given-names>
						</name>
						<name name-style="western">
							<surname>Kogan</surname>
							<given-names>S</given-names>
						</name>
						<name name-style="western">
							<surname>Ash</surname>
							<given-names>N</given-names>
						</name>
						<name name-style="western">
							<surname>Greenes</surname>
							<given-names>R A</given-names>
						</name>
						<name name-style="western">
							<surname>Boxwala</surname>
							<given-names>A A</given-names>
						</name>
					</person-group>
					<article-title>Characteristics of consumer terminology for health information retrieval</article-title>
					<source>Methods Inf Med</source>
					<year>2002</year>
					<volume>41</volume>
					<issue>4</issue>
					<fpage>289</fpage>
					<lpage>98</lpage>
					<pub-id pub-id-type="medline">12425240</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref6">
				<label>6</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>McCray</surname>
							<given-names>A T</given-names>
						</name>
						<name name-style="western">
							<surname>Loane</surname>
							<given-names>R F</given-names>
						</name>
						<name name-style="western">
							<surname>Browne</surname>
							<given-names>A C</given-names>
						</name>
						<name name-style="western">
							<surname>Bangalore</surname>
							<given-names>A K</given-names>
						</name>
					</person-group>
					<article-title>Terminology issues in user access to Web-based medical information</article-title>
					<source>Proc AMIA Symp</source>
					<year>1999</year>
					<fpage>107</fpage>
					<lpage>11</lpage>
					<comment>
						<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.amia.org/pubs/symposia/D005626.PDF" ext-link-type="uri"/>
					</comment>
					<pub-id pub-id-type="medline">10566330</pub-id>
					<pub-id pub-id-type="pii">D005626</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref7">
				<label>7</label>
				<nlm-citation citation-type="book">
					<person-group person-group-type="author">
						<name>
							<surname>Tse</surname>
							<given-names>T</given-names>
						</name>
					</person-group>
					<source>Identifying and Characterizing a &#x201C;Consumer Medical Vocabulary&#x201D;</source>
					<year>2003</year>
					<publisher-loc>College Park</publisher-loc>
					<publisher-name>University of Maryland</publisher-name>
				</nlm-citation>
			</ref>
			<ref id="ref8">
				<label>8</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Patrick</surname>
							<given-names>T B</given-names>
						</name>
						<name name-style="western">
							<surname>Monga</surname>
							<given-names>H K</given-names>
						</name>
						<name name-style="western">
							<surname>Sievert</surname>
							<given-names>M E</given-names>
						</name>
						<name name-style="western">
							<surname>Houston Hall</surname>
							<given-names>J</given-names>
						</name>
						<name name-style="western">
							<surname>Longo</surname>
							<given-names>D R</given-names>
						</name>
					</person-group>
					<article-title>Evaluation of controlled vocabulary resources for development of a consumer entry vocabulary for diabetes</article-title>
					<source>J Med Internet Res</source>
					<year>2001</year>
					<month>8</month>
					<day>28</day>
					<volume>3</volume>
					<issue>3</issue>
					<fpage>E24</fpage>
					<comment>
						<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.jmir.org/2001/3/e24/" ext-link-type="uri"/>
					</comment>
					<pub-id pub-id-type="medline">11720966</pub-id>
					<pub-id pub-id-type="doi">10.2196/jmir.3.3.e24</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref9">
				<label>9</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Smith</surname>
							<given-names>Catherine Arnott</given-names>
						</name>
						<name name-style="western">
							<surname>Stavri</surname>
							<given-names>P Zo&#xEB;</given-names>
						</name>
						<name name-style="western">
							<surname>Chapman</surname>
							<given-names>Wendy Webber</given-names>
						</name>
					</person-group>
					<article-title>In their own words? A terminological analysis of e-mail to a cancer information service</article-title>
					<source>Proc AMIA Symp</source>
					<year>2002</year>
					<fpage>697</fpage>
					<lpage>701</lpage>
					<pub-id pub-id-type="medline">12463914</pub-id>
					<pub-id pub-id-type="pii">D020002157</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref10">
				<label>10</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Zielstorff</surname>
							<given-names>Rita D</given-names>
						</name>
					</person-group>
					<article-title>Controlled vocabularies for consumer health</article-title>
					<source>J Biomed Inform</source>
					<year>2003</year>
					<month>8</month>
					<volume>36</volume>
					<issue>4-5</issue>
					<fpage>326</fpage>
					<lpage>33</lpage>
					<pub-id pub-id-type="medline">14643728</pub-id>
					<pub-id pub-id-type="pii">S1532046403000960</pub-id>
					<pub-id pub-id-type="doi">10.1016/j.jbi.2003.09.015</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref11">
				<label>11</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Krauthammer</surname>
							<given-names>Michael</given-names>
						</name>
						<name name-style="western">
							<surname>Nenadic</surname>
							<given-names>Goran</given-names>
						</name>
					</person-group>
					<article-title>Term identification in the biomedical literature</article-title>
					<source>J Biomed Inform</source>
					<year>2004</year>
					<month>12</month>
					<volume>37</volume>
					<issue>6</issue>
					<fpage>512</fpage>
					<lpage>26</lpage>
					<pub-id pub-id-type="medline">15542023</pub-id>
					<pub-id pub-id-type="pii">S1532-0464(04)00082-6</pub-id>
					<pub-id pub-id-type="doi">10.1016/j.jbi.2004.08.004</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref12">
				<label>12</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Wermter</surname>
							<given-names>Joachim</given-names>
						</name>
						<name name-style="western">
							<surname>Hahn</surname>
							<given-names>Udo</given-names>
						</name>
					</person-group>
					<article-title>Effective grading of termhood in biomedical literature</article-title>
					<source>AMIA Annu Symp Proc</source>
					<year>2005</year>

					<fpage>809</fpage>
					<lpage>13</lpage>
					<pub-id pub-id-type="medline">16779152</pub-id>
					<pub-id pub-id-type="pii">57280</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref13">
				<label>13</label>
				<nlm-citation citation-type="book">
					<person-group person-group-type="author">
						<name>
							<surname>Church</surname>
							<given-names>K</given-names>
						</name>
						<name>
							<surname>Gale</surname>
							<given-names>W</given-names>
						</name>
						<name>
							<surname>Hanks</surname>
							<given-names>P</given-names>
						</name>
						<name>
							<surname>Hindle</surname>
							<given-names>D</given-names>
						</name>
					</person-group>
					<source>Using statistics in lexical analysis. In: Zernik U, editor. Lexical Acquisition: Exploiting On-Line Resources to Build a Lexicon</source>
					<year>1991</year>
					<publisher-loc>Hillsdale, NJ</publisher-loc>
					<publisher-name>Lawrence Erlbaum Associates, Inc</publisher-name>
				</nlm-citation>
			</ref>
			<ref id="ref14">
				<label>14</label>
				<nlm-citation citation-type="book">
					<person-group person-group-type="author">
						<name>
							<surname>Basili</surname>
							<given-names>R</given-names>
						</name>
						<name>
							<surname>Pazienza</surname>
							<given-names>MT</given-names>
						</name>
						<name>
							<surname>Zanzotto</surname>
							<given-names>FM</given-names>
						</name>
					</person-group>
					<article-title>Modelling the syntactic contextual information for term extraction.</article-title>
					<source>In: Conference on Recent Advances in Natural Language Processing (RANLP2001)</source>
					<year>2001</year>
					<month>09</month>
					<day>5</day>
					<publisher-loc>Tzigiv Chark, Bulgaria</publisher-loc>
				</nlm-citation>
			</ref>
			<ref id="ref15">
				<label>15</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Harris</surname>
							<given-names>Marcelline R</given-names>
						</name>
						<name name-style="western">
							<surname>Savova</surname>
							<given-names>Guergana K</given-names>
						</name>
						<name name-style="western">
							<surname>Johnson</surname>
							<given-names>Thomas M</given-names>
						</name>
						<name name-style="western">
							<surname>Chute</surname>
							<given-names>Christopher G</given-names>
						</name>
					</person-group>
					<article-title>A term extraction tool for expanding content in the domain of functioning, disability, and health: proof of concept</article-title>
					<source>J Biomed Inform</source>
					<year>2003</year>
					<month>8</month>
					<volume>36</volume>
					<issue>4-5</issue>
					<fpage>250</fpage>
					<lpage>9</lpage>
					<pub-id pub-id-type="medline">14643720</pub-id>
					<pub-id pub-id-type="pii">S1532046403000868</pub-id>
					<pub-id pub-id-type="doi">10.1016/j.jbi.2003.09.005</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref16">
				<label>16</label>
				<nlm-citation citation-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Frantzi</surname>
							<given-names>KT</given-names>
						</name>
						<name>
							<surname>Ananiadou</surname>
							<given-names>S</given-names>
						</name>
						<name>
							<surname>Mima</surname>
							<given-names>H</given-names>
						</name>
					</person-group>
					<article-title>Automatic recognition of multi-word terms: the C-value/NC-value method</article-title>
					<source>Int J on Digital Libraries</source>
					<year>2000</year>
					<volume>3</volume>
					<issue>2</issue>
					<fpage>115</fpage>
					<lpage>130</lpage>
					<pub-id pub-id-type="doi">10.1007/s007999900023</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref17">
				<label>17</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Rindflesch</surname>
							<given-names>T C</given-names>
						</name>
						<name name-style="western">
							<surname>Hunter</surname>
							<given-names>L</given-names>
						</name>
						<name name-style="western">
							<surname>Aronson</surname>
							<given-names>A R</given-names>
						</name>
					</person-group>
					<article-title>Mining molecular binding terminology from biomedical text</article-title>
					<source>Proc AMIA Symp</source>
					<year>1999</year>
					<fpage>127</fpage>
					<lpage>31</lpage>
					<comment>
						<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.amia.org/pubs/symposia/D005564.PDF" ext-link-type="uri"/>
					</comment>
					<pub-id pub-id-type="medline">10566334</pub-id>
					<pub-id pub-id-type="pii">D005564</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref18">
				<label>18</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Fundel</surname>
							<given-names>Katrin</given-names>
						</name>
						<name name-style="western">
							<surname>G&#xFC;ttler</surname>
							<given-names>Daniel</given-names>
						</name>
						<name name-style="western">
							<surname>Zimmer</surname>
							<given-names>Ralf</given-names>
						</name>
						<name name-style="western">
							<surname>Apostolakis</surname>
							<given-names>Joannis</given-names>
						</name>
					</person-group>
					<article-title>A simple approach for protein name identification: prospects and limits</article-title>
					<source>BMC Bioinformatics</source>
					<year>2005</year>
					<volume>6</volume>
					<issue>Suppl 1</issue>
					<fpage>S15</fpage>
					<comment>
						<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.biomedcentral.com/1471-2105/6%20Suppl%201/S15" ext-link-type="uri"/>
					</comment>
					<pub-id pub-id-type="medline">15960827</pub-id>
					<pub-id pub-id-type="pii">1471-2105-6-S1-S15</pub-id>
					<pub-id pub-id-type="doi">10.1186/1471-2105-6-S1-S15</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref19">
				<label>19</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Saric</surname>
							<given-names>Jasmin</given-names>
						</name>
						<name name-style="western">
							<surname>Jensen</surname>
							<given-names>Lars Juhl</given-names>
						</name>
						<name name-style="western">
							<surname>Ouzounova</surname>
							<given-names>Rossitza</given-names>
						</name>
						<name name-style="western">
							<surname>Rojas</surname>
							<given-names>Isabel</given-names>
						</name>
						<name name-style="western">
							<surname>Bork</surname>

							<given-names>Peer</given-names>
						</name>
					</person-group>
					<article-title>Extraction of regulatory gene/protein networks from Medline</article-title>
					<source>Bioinformatics</source>
					<year>2006</year>
					<month>3</month>
					<day>15</day>
					<volume>22</volume>
					<issue>6</issue>
					<fpage>645</fpage>
					<lpage>50</lpage>
					<pub-id pub-id-type="medline">16046493</pub-id>
					<pub-id pub-id-type="pii">bti597</pub-id>
					<pub-id pub-id-type="doi">10.1093/bioinformatics/bti597</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref20">
				<label>20</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Saric</surname>
							<given-names>Jasmin</given-names>
						</name>
						<name name-style="western">
							<surname>Jensen</surname>
							<given-names>Lars J</given-names>
						</name>
						<name name-style="western">

							<surname>Rojas</surname>
							<given-names>Isabel</given-names>
						</name>
					</person-group>
					<article-title>Large-scale extraction of gene regulation for model organisms in an ontological context</article-title>
					<source>In Silico Biol</source>
					<year>2005</year>
					<volume>5</volume>
					<issue>1</issue>
					<fpage>21</fpage>
					<lpage>32</lpage>
					<comment>
						<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.bioinfo.de/isb/2004050004" ext-link-type="uri"/>
					</comment>
					<pub-id pub-id-type="medline">15972005</pub-id>
					<pub-id pub-id-type="pii">2004050004</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref21">
				<label>21</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Cohen</surname>
							<given-names>A M</given-names>
						</name>
						<name name-style="western">
							<surname>Hersh</surname>
							<given-names>W R</given-names>
						</name>
						<name name-style="western">
							<surname>Dubay</surname>
							<given-names>C</given-names>
						</name>
						<name name-style="western">
							<surname>Spackman</surname>
							<given-names>K</given-names>
						</name>
					</person-group>
					<article-title>Using co-occurrence network structure to extract synonymous gene and protein names from MEDLINE abstracts</article-title>
					<source>BMC Bioinformatics</source>
					<year>2005</year>
					<month>4</month>
					<volume>6</volume>
					<issue>1</issue>
					<fpage>103</fpage>
					<comment>
						<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.biomedcentral.com/1471-2105/6/103" ext-link-type="uri"/>
					</comment>
					<pub-id pub-id-type="medline">15847682</pub-id>
					<pub-id pub-id-type="pii">1471-2105-6-103</pub-id>
					<pub-id pub-id-type="doi">10.1186/1471-2105-6-103</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref22">
				<label>22</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Zeng</surname>
							<given-names>Qing T</given-names>
						</name>
						<name name-style="western">
							<surname>Tse</surname>
							<given-names>Tony</given-names>
						</name>
						<name name-style="western">
							<surname>Crowell</surname>

							<given-names>Jon</given-names>
						</name>
						<name name-style="western">
							<surname>Divita</surname>
							<given-names>Guy</given-names>
						</name>
						<name name-style="western">
							<surname>Roth</surname>
							<given-names>Laura</given-names>
						</name>
						<name name-style="western">
							<surname>Browne</surname>
							<given-names>Allen C</given-names>
						</name>
					</person-group>
					<article-title>Identifying consumer-friendly display (CFD) names for health concepts</article-title>
					<source>AMIA Annu Symp Proc</source>
					<year>2005</year>
					<fpage>859</fpage>
					<lpage>63</lpage>
				</nlm-citation>
			</ref>
			<ref id="ref23">
				<label>23</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Crowell</surname>
							<given-names>Jon</given-names>
						</name>
						<name name-style="western">
							<surname>Zeng</surname>
							<given-names>Qing</given-names>
						</name>
						<name name-style="western">
							<surname>Tse</surname>
							<given-names>Tony</given-names>
						</name>
					</person-group>
					<article-title>A web application to support consumer health vocabulary development</article-title>
					<source>AMIA Annu Symp Proc</source>
					<year>2005</year>
					<fpage>932</fpage>
					<pub-id pub-id-type="medline">16779219</pub-id>
					<pub-id pub-id-type="pii">58637</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref24">
				<label>24</label>
				<nlm-citation citation-type="book">
					<person-group person-group-type="author">
						<name>
							<surname>Hepple</surname>
							<given-names>M</given-names>
						</name>
					</person-group>
					<source>Independence and Commitment: Assumptions for Rapid Training and Execution of Rule-based Part-of-Speech Taggers. Proceedings of the 38th Annual Meeting of the Association for Computational Linguistics (ACL-2000). Hong Kong, October 2000</source>
					<year>2000</year>
					<publisher-loc>Midtown, NJ</publisher-loc>
					<publisher-name>Association for Computational Linguistics</publisher-name>
					<fpage>278</fpage>
					<lpage>285</lpage>
				</nlm-citation>
			</ref>
			<ref id="ref25">
				<label>25</label>
				<nlm-citation citation-type="book">
					<person-group person-group-type="author">
						<name>
							<surname>Browne</surname>
							<given-names>A</given-names>
						</name>
						<name>
							<surname>McCray</surname>
							<given-names>A</given-names>
						</name>
						<name>
							<surname>Srinivasan</surname>
							<given-names>S</given-names>
						</name>
					</person-group>
					<source>The Specialist Lexicon. Report No NLM-LHC-93-1</source>
					<year>2000</year>
					<publisher-loc>Bethesda, MD</publisher-loc>
					<publisher-name>Lister Hill National Center for Biomedical Communications, National Library of Medicine</publisher-name>
				</nlm-citation>
			</ref>
			<ref id="ref26">
				<label>26</label>
				<nlm-citation citation-type="web">
					<source>Notes on Tagger Integration</source>
					<access-date>2006 Oct 23</access-date>
					<comment>
						<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://mmtx.nlm.nih.gov/taggerNotes.shtml" ext-link-type="uri">http://mmtx.nlm.nih.gov/taggerNotes.shtml</ext-link>
					</comment>
				</nlm-citation>
			</ref>
			<ref id="ref27">
				<label>27</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Hanley</surname>
							<given-names>J A</given-names>
						</name>
						<name name-style="western">
							<surname>McNeil</surname>
							<given-names>B J</given-names>
						</name>
					</person-group>
					<article-title>The meaning and use of the area under a receiver operating characteristic (ROC) curve</article-title>
					<source>Radiology</source>
					<year>1982</year>
					<month>4</month>
					<volume>143</volume>
					<issue>1</issue>
					<fpage>29</fpage>
					<lpage>36</lpage>
					<pub-id pub-id-type="medline">7063747</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref28">
				<label>28</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Aronson</surname>
							<given-names>A R</given-names>
						</name>
					</person-group>
					<article-title>Effective mapping of biomedical text to the UMLS Metathesaurus: the MetaMap program</article-title>
					<source>Proc AMIA Symp</source>
					<year>2001</year>
					<fpage>17</fpage>
					<lpage>21</lpage>
					<pub-id pub-id-type="medline">11825149</pub-id>
					<pub-id pub-id-type="pii">D010001275</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref29">
				<label>29</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Cimino</surname>
							<given-names>J J</given-names>
						</name>
					</person-group>
					<article-title>Auditing the Unified Medical Language System with semantic methods</article-title>
					<source>J Am Med Inform Assoc</source>
					<year>1998</year>
					<month>1</month>
					<volume>5</volume>
					<issue>1</issue>
					<fpage>41</fpage>
					<lpage>51</lpage>
					<comment>
						<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.pubmedcentral.nih.gov/articlerender.fcgi?tool=pubmed&amp;pubmedid=9452984" ext-link-type="uri"/>
					</comment>
					<pub-id pub-id-type="medline">9452984</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref30">
				<label>30</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Bodenreider</surname>
							<given-names>O</given-names>
						</name>
						<name name-style="western">
							<surname>Nelson</surname>
							<given-names>S J</given-names>
						</name>
						<name name-style="western">
							<surname>Hole</surname>
							<given-names>W T</given-names>
						</name>
						<name name-style="western">
							<surname>Chang</surname>
							<given-names>H F</given-names>
						</name>
					</person-group>
					<article-title>Beyond synonymy: exploiting the UMLS semantics in mapping vocabularies</article-title>
					<source>Proc AMIA Symp</source>
					<year>1998</year>
					<fpage>815</fpage>
					<lpage>9</lpage>
					<pub-id pub-id-type="medline">9929332</pub-id>
				</nlm-citation>
			</ref>
			<ref id="ref31">
				<label>31</label>
				<nlm-citation citation-type="journal" xlink:type="simple">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Humphreys</surname>
							<given-names>B L</given-names>
						</name>
						<name name-style="western">
							<surname>McCray</surname>
							<given-names>A T</given-names>
						</name>
						<name name-style="western">
							<surname>Cheh</surname>
							<given-names>M L</given-names>
						</name>
					</person-group>
					<article-title>Evaluating the coverage of controlled health data terminologies: report on the results of the NLM/AHCPR large scale vocabulary test</article-title>
					<source>J Am Med Inform Assoc</source>
					<year>1997</year>
					<month>11</month>
					<volume>4</volume>
					<issue>6</issue>
					<fpage>484</fpage>
					<lpage>500</lpage>
					<comment>
						<ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://www.pubmedcentral.nih.gov/articlerender.fcgi?tool=pubmed&amp;pubmedid=9391936" ext-link-type="uri"/>
					</comment>
					<pub-id pub-id-type="medline">9391936</pub-id>
				</nlm-citation>
			</ref>
		</ref-list>
		<glossary>
				<title>Abbreviations</title>
				<def-list>
					<def-item>
						<term id="abb1">ATR</term>
						<def>
							<p>automated term recognition</p>
						</def>
					</def-item>
					<def-item>
						<term id="abb2">AUC</term>
						<def>
							<p>area under the curve</p>
						</def>
					</def-item>
					<def-item>
						<term id="abb3">CHV</term>
						<def>
							<p>consumer health vocabulary</p>
						</def>
					</def-item>
					<def-item>
						<term id="abb4">NIH</term>
						<def>
							<p>National Institutes of Health</p>
						</def>
					</def-item>
					<def-item>
						<term id="abb5">NLM</term>
						<def>
							<p>National Library of Medicine</p>
						</def>
					</def-item>
					<def-item>
						<term id="abb6">OAC</term>
						<def>
							<p>open access and collaborative</p>
						</def>
					</def-item>
					<def-item>
						<term id="abb7">POS</term>
						<def>
							<p>part of speech</p>
						</def>
					</def-item>
					<def-item>
						<term id="abb8">ROC</term>
						<def>
							<p>receiver operating characteristic</p>
						</def>
					</def-item>
					<def-item>
						<term id="abb9">UMLS</term>
						<def>
							<p>Unified Medical Language System</p>
						</def>
					</def-item>
				</def-list>
		</glossary>
	</back>
</article>

